1/* 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ 29/* 30 * Copyright (c) 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * (c) UNIX System Laboratories, Inc. 33 * All or some portions of this file are derived from material licensed 34 * to the University of California by American Telephone and Telegraph 35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 36 * the permission of UNIX System Laboratories, Inc. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 67 */ 68/* 69 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 70 * support for mandatory and extensible security protections. 
This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 * External virtual filesystem routines
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/uio.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/kauth.h>	/* NOTE(review): duplicate of the <sys/kauth.h> include above */
#include <sys/user.h>
#include <miscfs/fifofs/fifo.h>

#include <string.h>
#include <machine/spl.h>


#include <kern/assert.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <kern/kalloc.h>	/* kalloc()/kfree() */
#include <kern/clock.h>		/* delay_for_interval() */
#include <libkern/OSAtomic.h>	/* OSAddAtomic() */


#include <vm/vm_protos.h>	/* vnode_pager_vrele() */

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

extern lck_grp_t *vnode_lck_grp;
extern lck_attr_t *vnode_lck_attr;


extern lck_mtx_t * mnt_list_mtx_lock;

/* Map S_IFMT mode bits (mode >> 12) to vnode types. */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
/* Inverse mapping: vnode type (enum vtype) to S_IFMT mode bits. */
int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/* XXX next prototype should be from <nfs/nfs.h> */
extern int	nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);

/* XXX next
prototype should be from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
	void * array,
	size_t nmembers,
	size_t member_size,
	int (*)(const void *, const void *));

extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
__private_extern__ void vntblinit(void);
__private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1,
		unsigned int val2);
__private_extern__ int unlink1(vfs_context_t, struct nameidata *, int);

/* Forward declarations for vnode lifecycle helpers defined below. */
static void vnode_list_add(vnode_t);
static void vnode_list_remove(vnode_t);
static void vnode_list_remove_locked(vnode_t);

static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t, int flags);
static void vclean(vnode_t vp, int flag);
static void vnode_reclaim_internal(vnode_t, int, int, int);

static void vnode_dropiocount(vnode_t);
static errno_t vnode_getiocount(vnode_t vp, int vid, int vflags);
static int vget_internal(vnode_t, int, int);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int vnode_reload(vnode_t);
static int vnode_isinuse_locked(vnode_t, int, int);

/* Mount-list / per-mount vnode iteration helpers. */
static void insmntque(vnode_t vp, mount_t mp);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int );
static void vnode_iterate_setup(mount_t);
static int vnode_umount_preflight(mount_t, vnode_t, int);
static int vnode_iterate_prepare(mount_t);
static int vnode_iterate_reloadq(mount_t);
static void vnode_iterate_clear(mount_t);

errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);

TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
TAILQ_HEAD(deadlst, vnode) vnode_dead_list;	/* vnode dead list */

TAILQ_HEAD(ragelst, vnode) vnode_rage_list;	/* vnode rapid age list */
struct timeval rage_tv;		/* start of the current rapid-age window (set by microuptime) */
int	rage_limit = 0;		/* max vnodes allowed on the rage list (set in vntblinit) */
int	ragevnodes = 0;		/* current count of vnodes on the rage list */

#define RAGE_LIMIT_MIN	100	/* floor for rage_limit */
#define RAGE_TIME_LIMIT 5	/* seconds; used when aging entries off the rage list */

struct	mntlist mountlist;	/* mounted filesystem list */
static int nummounts = 0;

/*
 * The v_freelist.tqe_prev field doubles as an "on a list" flag:
 * the sentinel value 0xdeadb means the vnode is on no free/dead/rage list.
 */
#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list)	\
	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \
		panic("%s: %s vnode not on %slist", (fun), (list), (list));
#else
#define VLISTCHECK(fun, vp, list)
#endif /* DIAGNOSTIC */

/* mark vp as being on no list (poison the linkage with the sentinel) */
#define VLISTNONE(vp)	\
	do {	\
		(vp)->v_freelist.tqe_next = (struct vnode *)0;	\
		(vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;	\
	} while(0)

/* true if vp is currently on one of the free/dead/rage lists */
#define VONLIST(vp)	\
	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "free");	\
		TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		freevnodes--;	\
	} while(0)



/* remove a vnode from dead vnode list */
#define VREMDEAD(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "dead");	\
		TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		vp->v_listflag &= ~VLIST_DEAD;	\
		deadvnodes--;	\
	} while(0)


/* remove a vnode from rage vnode list */
#define VREMRAGE(fun, vp)	\
	do {	\
		if ( !(vp->v_listflag & VLIST_RAGE)) \
			panic("VREMRAGE: vp not on rage list"); \
		VLISTCHECK((fun), (vp), "rage");	\
		TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		vp->v_listflag &= ~VLIST_RAGE;	\
		ragevnodes--;	\
	} while(0)


/*
 * vnodetarget hasn't been used in a long time, but
 * it was exported for some reason... I'm leaving in
 * place for now... it should be deprecated out of the
 * exports and removed eventually.
262 */ 263unsigned long vnodetarget; /* target for vnreclaim() */ 264#define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */ 265 266/* 267 * We need quite a few vnodes on the free list to sustain the 268 * rapid stat() the compilation process does, and still benefit from the name 269 * cache. Having too few vnodes on the free list causes serious disk 270 * thrashing as we cycle through them. 271 */ 272#define VNODE_FREE_MIN CONFIG_VNODE_FREE_MIN /* freelist should have at least this many */ 273 274/* 275 * Initialize the vnode management data structures. 276 */ 277__private_extern__ void 278vntblinit(void) 279{ 280 TAILQ_INIT(&vnode_free_list); 281 TAILQ_INIT(&vnode_rage_list); 282 TAILQ_INIT(&vnode_dead_list); 283 TAILQ_INIT(&mountlist); 284 285 if (!vnodetarget) 286 vnodetarget = VNODE_FREE_TARGET; 287 288 microuptime(&rage_tv); 289 rage_limit = desiredvnodes / 100; 290 291 if (rage_limit < RAGE_LIMIT_MIN) 292 rage_limit = RAGE_LIMIT_MIN; 293 294 /* 295 * Scale the vm_object_cache to accomodate the vnodes 296 * we want to cache 297 */ 298 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN); 299} 300 301/* Reset the VM Object Cache with the values passed in */ 302__private_extern__ kern_return_t 303reset_vmobjectcache(unsigned int val1, unsigned int val2) 304{ 305 vm_size_t oval = val1 - VNODE_FREE_MIN; 306 vm_size_t nval; 307 308 if(val2 < VNODE_FREE_MIN) 309 nval = 0; 310 else 311 nval = val2 - VNODE_FREE_MIN; 312 313 return(adjust_vm_object_cache(oval, nval)); 314} 315 316 317/* the timeout is in 10 msecs */ 318int 319vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) { 320 int error = 0; 321 struct timespec ts; 322 323 KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0); 324 325 if (vp->v_numoutput > output_target) { 326 327 slpflag &= ~PDROP; 328 329 vnode_lock(vp); 330 331 while ((vp->v_numoutput > output_target) && error == 0) { 332 if (output_target) 333 
vp->v_flag |= VTHROTTLED; 334 else 335 vp->v_flag |= VBWAIT; 336 337 ts.tv_sec = (slptimeout/100); 338 ts.tv_nsec = (slptimeout % 1000) * 10 * NSEC_PER_USEC * 1000 ; 339 error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts); 340 } 341 vnode_unlock(vp); 342 } 343 KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0); 344 345 return error; 346} 347 348 349void 350vnode_startwrite(vnode_t vp) { 351 352 OSAddAtomic(1, &vp->v_numoutput); 353} 354 355 356void 357vnode_writedone(vnode_t vp) 358{ 359 if (vp) { 360 OSAddAtomic(-1, &vp->v_numoutput); 361 362 if (vp->v_numoutput <= 1) { 363 int need_wakeup = 0; 364 365 vnode_lock_spin(vp); 366 367 if (vp->v_numoutput < 0) 368 panic("vnode_writedone: numoutput < 0"); 369 370 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= 1)) { 371 vp->v_flag &= ~VTHROTTLED; 372 need_wakeup = 1; 373 } 374 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) { 375 vp->v_flag &= ~VBWAIT; 376 need_wakeup = 1; 377 } 378 vnode_unlock(vp); 379 380 if (need_wakeup) 381 wakeup((caddr_t)&vp->v_numoutput); 382 } 383 } 384} 385 386 387 388int 389vnode_hasdirtyblks(vnode_t vp) 390{ 391 struct cl_writebehind *wbp; 392 393 /* 394 * Not taking the buf_mtxp as there is little 395 * point doing it. Even if the lock is taken the 396 * state can change right after that. If their 397 * needs to be a synchronization, it must be driven 398 * by the caller 399 */ 400 if (vp->v_dirtyblkhd.lh_first) 401 return (1); 402 403 if (!UBCINFOEXISTS(vp)) 404 return (0); 405 406 wbp = vp->v_ubcinfo->cl_wbehind; 407 408 if (wbp && (wbp->cl_number || wbp->cl_scmap)) 409 return (1); 410 411 return (0); 412} 413 414int 415vnode_hascleanblks(vnode_t vp) 416{ 417 /* 418 * Not taking the buf_mtxp as there is little 419 * point doing it. Even if the lock is taken the 420 * state can change right after that. 
If their 421 * needs to be a synchronization, it must be driven 422 * by the caller 423 */ 424 if (vp->v_cleanblkhd.lh_first) 425 return (1); 426 return (0); 427} 428 429void 430vnode_iterate_setup(mount_t mp) 431{ 432 while (mp->mnt_lflag & MNT_LITER) { 433 mp->mnt_lflag |= MNT_LITERWAIT; 434 msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", NULL); 435 } 436 437 mp->mnt_lflag |= MNT_LITER; 438 439} 440 441static int 442vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) 443{ 444 vnode_t vp; 445 446 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 447 /* disable preflight only for udf, a hack to be removed after 4073176 is fixed */ 448 if (vp->v_tag == VT_UDF) 449 return 0; 450 if (vp->v_type == VDIR) 451 continue; 452 if (vp == skipvp) 453 continue; 454 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || 455 (vp->v_flag & VNOFLUSH))) 456 continue; 457 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) 458 continue; 459 if ((flags & WRITECLOSE) && 460 (vp->v_writecount == 0 || vp->v_type != VREG)) 461 continue; 462 /* Look for busy vnode */ 463 if (((vp->v_usecount != 0) && 464 ((vp->v_usecount - vp->v_kusecount) != 0))) 465 return(1); 466 } 467 468 return(0); 469} 470 471/* 472 * This routine prepares iteration by moving all the vnodes to worker queue 473 * called with mount lock held 474 */ 475int 476vnode_iterate_prepare(mount_t mp) 477{ 478 vnode_t vp; 479 480 if (TAILQ_EMPTY(&mp->mnt_vnodelist)) { 481 /* nothing to do */ 482 return (0); 483 } 484 485 vp = TAILQ_FIRST(&mp->mnt_vnodelist); 486 vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first); 487 mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first; 488 mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last; 489 490 TAILQ_INIT(&mp->mnt_vnodelist); 491 if (mp->mnt_newvnodes.tqh_first != NULL) 492 panic("vnode_iterate_prepare: newvnode when entering vnode"); 493 TAILQ_INIT(&mp->mnt_newvnodes); 494 495 return (1); 496} 497 498 499/* called with mount lock held */ 
500int 501vnode_iterate_reloadq(mount_t mp) 502{ 503 int moved = 0; 504 505 /* add the remaining entries in workerq to the end of mount vnode list */ 506 if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 507 struct vnode * mvp; 508 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst); 509 510 /* Joining the workerque entities to mount vnode list */ 511 if (mvp) 512 mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first; 513 else 514 mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first; 515 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last; 516 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last; 517 TAILQ_INIT(&mp->mnt_workerqueue); 518 } 519 520 /* add the newvnodes to the head of mount vnode list */ 521 if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) { 522 struct vnode * nlvp; 523 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst); 524 525 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first; 526 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first; 527 if(mp->mnt_vnodelist.tqh_first) 528 mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next; 529 else 530 mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last; 531 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first; 532 TAILQ_INIT(&mp->mnt_newvnodes); 533 moved = 1; 534 } 535 536 return(moved); 537} 538 539 540void 541vnode_iterate_clear(mount_t mp) 542{ 543 mp->mnt_lflag &= ~MNT_LITER; 544 if (mp->mnt_lflag & MNT_LITERWAIT) { 545 mp->mnt_lflag &= ~MNT_LITERWAIT; 546 wakeup(mp); 547 } 548} 549 550 551int 552vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *), 553 void *arg) 554{ 555 struct vnode *vp; 556 int vid, retval; 557 int ret = 0; 558 559 mount_lock(mp); 560 561 vnode_iterate_setup(mp); 562 563 /* it is returns 0 then there is nothing to do */ 564 retval = vnode_iterate_prepare(mp); 565 566 if (retval == 0) { 567 vnode_iterate_clear(mp); 568 mount_unlock(mp); 569 return(ret); 570 } 571 572 /* 
iterate over all the vnodes */ 573 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 574 vp = TAILQ_FIRST(&mp->mnt_workerqueue); 575 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); 576 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 577 vid = vp->v_id; 578 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) { 579 continue; 580 } 581 mount_unlock(mp); 582 583 if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) { 584 mount_lock(mp); 585 continue; 586 } 587 if (flags & VNODE_RELOAD) { 588 /* 589 * we're reloading the filesystem 590 * cast out any inactive vnodes... 591 */ 592 if (vnode_reload(vp)) { 593 /* vnode will be recycled on the refcount drop */ 594 vnode_put(vp); 595 mount_lock(mp); 596 continue; 597 } 598 } 599 600 retval = callout(vp, arg); 601 602 switch (retval) { 603 case VNODE_RETURNED: 604 case VNODE_RETURNED_DONE: 605 vnode_put(vp); 606 if (retval == VNODE_RETURNED_DONE) { 607 mount_lock(mp); 608 ret = 0; 609 goto out; 610 } 611 break; 612 613 case VNODE_CLAIMED_DONE: 614 mount_lock(mp); 615 ret = 0; 616 goto out; 617 case VNODE_CLAIMED: 618 default: 619 break; 620 } 621 mount_lock(mp); 622 } 623 624out: 625 (void)vnode_iterate_reloadq(mp); 626 vnode_iterate_clear(mp); 627 mount_unlock(mp); 628 return (ret); 629} 630 631void 632mount_lock_renames(mount_t mp) 633{ 634 lck_mtx_lock(&mp->mnt_renamelock); 635} 636 637void 638mount_unlock_renames(mount_t mp) 639{ 640 lck_mtx_unlock(&mp->mnt_renamelock); 641} 642 643void 644mount_lock(mount_t mp) 645{ 646 lck_mtx_lock(&mp->mnt_mlock); 647} 648 649void 650mount_unlock(mount_t mp) 651{ 652 lck_mtx_unlock(&mp->mnt_mlock); 653} 654 655 656void 657mount_ref(mount_t mp, int locked) 658{ 659 if ( !locked) 660 mount_lock(mp); 661 662 mp->mnt_count++; 663 664 if ( !locked) 665 mount_unlock(mp); 666} 667 668 669void 670mount_drop(mount_t mp, int locked) 671{ 672 if ( !locked) 673 mount_lock(mp); 674 675 mp->mnt_count--; 676 677 if (mp->mnt_count == 0 && 
(mp->mnt_lflag & MNT_LDRAIN)) 678 wakeup(&mp->mnt_lflag); 679 680 if ( !locked) 681 mount_unlock(mp); 682} 683 684 685int 686mount_iterref(mount_t mp, int locked) 687{ 688 int retval = 0; 689 690 if (!locked) 691 mount_list_lock(); 692 if (mp->mnt_iterref < 0) { 693 retval = 1; 694 } else { 695 mp->mnt_iterref++; 696 } 697 if (!locked) 698 mount_list_unlock(); 699 return(retval); 700} 701 702int 703mount_isdrained(mount_t mp, int locked) 704{ 705 int retval; 706 707 if (!locked) 708 mount_list_lock(); 709 if (mp->mnt_iterref < 0) 710 retval = 1; 711 else 712 retval = 0; 713 if (!locked) 714 mount_list_unlock(); 715 return(retval); 716} 717 718void 719mount_iterdrop(mount_t mp) 720{ 721 mount_list_lock(); 722 mp->mnt_iterref--; 723 wakeup(&mp->mnt_iterref); 724 mount_list_unlock(); 725} 726 727void 728mount_iterdrain(mount_t mp) 729{ 730 mount_list_lock(); 731 while (mp->mnt_iterref) 732 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL); 733 /* mount iterations drained */ 734 mp->mnt_iterref = -1; 735 mount_list_unlock(); 736} 737void 738mount_iterreset(mount_t mp) 739{ 740 mount_list_lock(); 741 if (mp->mnt_iterref == -1) 742 mp->mnt_iterref = 0; 743 mount_list_unlock(); 744} 745 746/* always called with mount lock held */ 747int 748mount_refdrain(mount_t mp) 749{ 750 if (mp->mnt_lflag & MNT_LDRAIN) 751 panic("already in drain"); 752 mp->mnt_lflag |= MNT_LDRAIN; 753 754 while (mp->mnt_count) 755 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL); 756 757 if (mp->mnt_vnodelist.tqh_first != NULL) 758 panic("mount_refdrain: dangling vnode"); 759 760 mp->mnt_lflag &= ~MNT_LDRAIN; 761 762 return(0); 763} 764 765 766/* 767 * Mark a mount point as busy. Used to synchronize access and to delay 768 * unmounting. 
769 */ 770int 771vfs_busy(mount_t mp, int flags) 772{ 773 774restart: 775 if (mp->mnt_lflag & MNT_LDEAD) 776 return(ENOENT); 777 778 if (mp->mnt_lflag & MNT_LUNMOUNT) { 779 if (flags & LK_NOWAIT) 780 return (ENOENT); 781 782 mount_lock(mp); 783 784 if (mp->mnt_lflag & MNT_LDEAD) { 785 mount_unlock(mp); 786 return(ENOENT); 787 } 788 if (mp->mnt_lflag & MNT_LUNMOUNT) { 789 mp->mnt_lflag |= MNT_LWAIT; 790 /* 791 * Since all busy locks are shared except the exclusive 792 * lock granted when unmounting, the only place that a 793 * wakeup needs to be done is at the release of the 794 * exclusive lock at the end of dounmount. 795 */ 796 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL); 797 return (ENOENT); 798 } 799 mount_unlock(mp); 800 } 801 802 lck_rw_lock_shared(&mp->mnt_rwlock); 803 804 /* 805 * until we are granted the rwlock, it's possible for the mount point to 806 * change state, so reevaluate before granting the vfs_busy 807 */ 808 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { 809 lck_rw_done(&mp->mnt_rwlock); 810 goto restart; 811 } 812 return (0); 813} 814 815/* 816 * Free a busy filesystem. 817 */ 818 819void 820vfs_unbusy(mount_t mp) 821{ 822 lck_rw_done(&mp->mnt_rwlock); 823} 824 825 826 827static void 828vfs_rootmountfailed(mount_t mp) { 829 830 mount_list_lock(); 831 mp->mnt_vtable->vfc_refcount--; 832 mount_list_unlock(); 833 834 vfs_unbusy(mp); 835 836 mount_lock_destroy(mp); 837 838#if CONFIG_MACF 839 mac_mount_label_destroy(mp); 840#endif 841 842 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); 843} 844 845/* 846 * Lookup a filesystem type, and if found allocate and initialize 847 * a mount structure for it. 848 * 849 * Devname is usually updated by mount(8) after booting. 
850 */ 851static mount_t 852vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname) 853{ 854 mount_t mp; 855 856 mp = _MALLOC_ZONE((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); 857 bzero((char *)mp, (u_long)sizeof(struct mount)); 858 859 /* Initialize the default IO constraints */ 860 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; 861 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; 862 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt; 863 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt; 864 mp->mnt_devblocksize = DEV_BSIZE; 865 mp->mnt_alignmentmask = PAGE_MASK; 866 mp->mnt_ioflags = 0; 867 mp->mnt_realrootvp = NULLVP; 868 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL; 869 870 mount_lock_init(mp); 871 (void)vfs_busy(mp, LK_NOWAIT); 872 873 TAILQ_INIT(&mp->mnt_vnodelist); 874 TAILQ_INIT(&mp->mnt_workerqueue); 875 TAILQ_INIT(&mp->mnt_newvnodes); 876 877 mp->mnt_vtable = vfsp; 878 mp->mnt_op = vfsp->vfc_vfsops; 879 mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS; 880 mp->mnt_vnodecovered = NULLVP; 881 //mp->mnt_stat.f_type = vfsp->vfc_typenum; 882 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 883 884 mount_list_lock(); 885 vfsp->vfc_refcount++; 886 mount_list_unlock(); 887 888 strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); 889 mp->mnt_vfsstat.f_mntonname[0] = '/'; 890 /* XXX const poisoning layering violation */ 891 (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL); 892 893#if CONFIG_MACF 894 mac_mount_label_init(mp); 895 mac_mount_label_associate(vfs_context_kernel(), mp); 896#endif 897 return (mp); 898} 899 900errno_t 901vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp) 902{ 903 struct vfstable *vfsp; 904 905 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 906 if (!strncmp(vfsp->vfc_name, fstypename, 907 sizeof(vfsp->vfc_name))) 908 break; 909 if (vfsp == NULL) 910 return (ENODEV); 911 912 *mpp = vfs_rootmountalloc_internal(vfsp, devname); 913 914 if (*mpp) 
915 return (0); 916 917 return (ENOMEM); 918} 919 920 921/* 922 * Find an appropriate filesystem to use for the root. If a filesystem 923 * has not been preselected, walk through the list of known filesystems 924 * trying those that have mountroot routines, and try them until one 925 * works or we have tried them all. 926 */ 927extern int (*mountroot)(void); 928 929int 930vfs_mountroot(void) 931{ 932#if CONFIG_MACF 933 struct vnode *vp; 934#endif 935 struct vfstable *vfsp; 936 vfs_context_t ctx = vfs_context_kernel(); 937 struct vfs_attr vfsattr; 938 int error; 939 mount_t mp; 940 vnode_t bdevvp_rootvp; 941 942 if (mountroot != NULL) { 943 /* 944 * used for netboot which follows a different set of rules 945 */ 946 error = (*mountroot)(); 947 return (error); 948 } 949 if ((error = bdevvp(rootdev, &rootvp))) { 950 printf("vfs_mountroot: can't setup bdevvp\n"); 951 return (error); 952 } 953 /* 954 * 4951998 - code we call in vfc_mountroot may replace rootvp 955 * so keep a local copy for some house keeping. 956 */ 957 bdevvp_rootvp = rootvp; 958 959 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 960 if (vfsp->vfc_mountroot == NULL) 961 continue; 962 963 mp = vfs_rootmountalloc_internal(vfsp, "root_device"); 964 mp->mnt_devvp = rootvp; 965 966 if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) { 967 if ( bdevvp_rootvp != rootvp ) { 968 /* 969 * rootvp changed... 970 * bump the iocount and fix up mnt_devvp for the 971 * new rootvp (it will already have a usecount taken)... 972 * drop the iocount and the usecount on the orignal 973 * since we are no longer going to use it... 974 */ 975 vnode_getwithref(rootvp); 976 mp->mnt_devvp = rootvp; 977 978 vnode_rele(bdevvp_rootvp); 979 vnode_put(bdevvp_rootvp); 980 } 981 mp->mnt_devvp->v_specflags |= SI_MOUNTEDON; 982 983 vfs_unbusy(mp); 984 985 mount_list_add(mp); 986 987 /* 988 * cache the IO attributes for the underlying physical media... 
989 * an error return indicates the underlying driver doesn't 990 * support all the queries necessary... however, reasonable 991 * defaults will have been set, so no reason to bail or care 992 */ 993 vfs_init_io_attributes(rootvp, mp); 994 995 /* 996 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS. 997 */ 998 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) { 999 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; 1000 } 1001 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) { 1002 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT; 1003 } 1004 1005 /* 1006 * Probe root file system for additional features. 1007 */ 1008 (void)VFS_START(mp, 0, ctx); 1009 1010 VFSATTR_INIT(&vfsattr); 1011 VFSATTR_WANTED(&vfsattr, f_capabilities); 1012 if (vfs_getattr(mp, &vfsattr, ctx) == 0 && 1013 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { 1014 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && 1015 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { 1016 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; 1017 } 1018#if NAMEDSTREAMS 1019 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) && 1020 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) { 1021 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS; 1022 } 1023#endif 1024 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) && 1025 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) { 1026 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID; 1027 } 1028 } 1029 1030 /* 1031 * get rid of iocount reference returned 1032 * by bdevvp (or picked up by us on the substitued 1033 * rootvp)... 
			   it (or we) will have also taken
			 * a usecount reference which we want to keep
			 */
			vnode_put(rootvp);

#if CONFIG_MACF
			if ((vfs_flags(mp) & MNT_MULTILABEL) == 0)
				return (0);

			error = VFS_ROOT(mp, &vp, ctx);
			if (error) {
				printf("%s() VFS_ROOT() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}

			/* VFS_ROOT provides reference so flags = 0 */
			error = vnode_label(mp, NULL, vp, NULL, 0, ctx);
			if (error) {
				printf("%s() vnode_label() returned %d\n",
				    __func__, error);
				dounmount(mp, MNT_FORCE, 0, ctx);
				goto fail;
			}
#endif
			return (0);
		}
#if CONFIG_MACF
fail:
#endif
		vfs_rootmountfailed(mp);

		if (error != EINVAL)
			printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
extern mount_t vfs_getvfs_locked(fsid_t *);

/*
 * Lookup a mount by fsid.
 * NOTE(review): the second/third arguments to mount_list_lookupby_fsid
 * appear to be "list already locked" and "take a reference" flags —
 * confirm against its definition; here neither is requested.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	return (mount_list_lookupby_fsid(fsid, 0, 0));
}

/*
 * Same lookup as vfs_getvfs(), but for callers that already hold
 * the mount list lock (second argument = 1).
 */
struct mount *
vfs_getvfs_locked(fsid_t *fsid)
{
	return(mount_list_lookupby_fsid(fsid, 1, 0));
}

/*
 * Linear scan of the mount list for an exact f_mntonname match.
 * Returns the mount pointer without taking a mount reference —
 * NOTE(review): presumably the caller must guarantee the mount
 * cannot be unmounted while it uses the result; verify at call sites.
 */
struct mount *
vfs_getvfs_by_mntonname(char *path)
{
	mount_t retmp = (mount_t)0;
	mount_t mp;

	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
					sizeof(mp->mnt_vfsstat.f_mntonname))) {
			retmp = mp;
			goto out;
		}
	}
out:
	mount_list_unlock();
	return (retmp);
}

/* generation number for creation of new fsids */
u_short mntid_gen = 0;
/*
 * Get a new unique fsid
 *
 * val[0] encodes (nblkdev + filesystem type, generation) via makedev();
 * val[1] is the filesystem type number.  The generation counter skips 0
 * on wraparound, and candidate fsids are re-probed under the mount list
 * lock until one is not already in use.
 */
void
vfs_getnewfsid(struct mount *mp)
{

	fsid_t tfsid;
	int mtype;
	mount_t nmp;

	mount_list_lock();

	/* generate a new fsid */
	mtype = mp->mnt_vtable->vfc_typenum;
	if (++mntid_gen == 0)
		mntid_gen++;
	tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
	tfsid.val[1] = mtype;

	/*
	 * NOTE(review): the outer TAILQ_FOREACH looks redundant — the inner
	 * while() already probes the whole list via vfs_getvfs_locked() —
	 * but it is preserved as-is.
	 */
	TAILQ_FOREACH(nmp, &mountlist, mnt_list) {
		while (vfs_getvfs_locked(&tfsid)) {
			if (++mntid_gen == 0)
				mntid_gen++;
			tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen);
		}
	}
	mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1];
	mount_list_unlock();
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes, freevnodes, deadvnodes;


/*
 * Move a vnode from one mount queue to another.
 *
 * Passing mp == NULL simply removes vp from its current mount's list.
 * VNAMED_MOUNT in v_lflag tracks "vp is on some mount's vnode list";
 * the panics below assert that flag and list membership stay in sync.
 */
static void
insmntque(vnode_t vp, mount_t mp)
{
	mount_t lmp;
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
		if ((vp->v_lflag & VNAMED_MOUNT) == 0)
			panic("insmntque: vp not in mount vnode list");
		vp->v_lflag &= ~VNAMED_MOUNT;

		mount_lock(lmp);

		mount_drop(lmp, 1);

		/*
		 * vp may sit on any one of three queues (vnodelist,
		 * newvnodes, workerqueue).  If it is the tail element
		 * (tqe_next == NULL) we must identify which queue by
		 * comparing against each queue's TAILQ_LAST; otherwise
		 * the unlink below is a hand-expanded TAILQ_REMOVE that
		 * works regardless of which queue vp is on.
		 */
		if (vp->v_mntvnodes.tqe_next == NULL) {
			if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp)
				TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
			else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp)
				TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
			else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp)
				TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
		} else {
			vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
			*vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
		}
		vp->v_mntvnodes.tqe_next = NULL;
		vp->v_mntvnodes.tqe_prev = NULL;
		mount_unlock(lmp);
		return;
	}

	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL) {
		mount_lock(mp);
		if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0))
			panic("vp already in mount list");
		/*
		 * While an iterator is active (MNT_LITER) new vnodes are
		 * parked on mnt_newvnodes so the iteration stays stable.
		 */
		if (mp->mnt_lflag & MNT_LITER)
			TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
		else
			TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
		if (vp->v_lflag & VNAMED_MOUNT)
			panic("insmntque: vp already in mount vnode list");
		vp->v_lflag |= VNAMED_MOUNT;
		mount_ref(mp, 1);
		mount_unlock(mp);
	}
}


/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 *
 * On success *vpp holds a vnode with a usecount (vnode_ref).
 * Failures after creation panic — these vnodes are required for boot.
 */
int
bdevvp(dev_t dev, vnode_t *vpp)
{
	vnode_t	nvp;
	int	error;
	struct vnode_fsparam vfsp;
	struct vfs_context context;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENODEV);
	}

	context.vc_thread = current_thread();
	context.vc_ucred = FSCRED;

	vfsp.vnfs_mp = (struct mount *)0;
	vfsp.vnfs_vtype = VBLK;
	vfsp.vnfs_str = "bdevvp";
	vfsp.vnfs_dvp = NULL;
	vfsp.vnfs_fsnode = NULL;
	vfsp.vnfs_cnp = NULL;
	vfsp.vnfs_vops = spec_vnodeop_p;
	vfsp.vnfs_rdev = dev;
	vfsp.vnfs_filesize = 0;

	vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE;

	vfsp.vnfs_marksystem = 0;
	vfsp.vnfs_markroot = 0;

	if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) {
		*vpp = NULLVP;
		return (error);
	}
	vnode_lock_spin(nvp);
	nvp->v_flag |= VBDEVVP;
	nvp->v_tag = VT_NON;	/* set this to VT_NON so during aliasing it can be replaced */
	vnode_unlock(nvp);
	if ( (error = vnode_ref(nvp)) ) {
		panic("bdevvp failed: vnode_ref");
		return (error);
	}
	if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) {
		panic("bdevvp failed: fsync");
		return (error);
	}
	if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) {
		panic("bdevvp failed: invalidateblks");
		return (error);
	}

#if CONFIG_MACF
	/*
	 * XXXMAC: We can't put a MAC check here, the system will
	 * panic without this vnode.
	 */
#endif /* MAC */

	if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) {
		panic("bdevvp failed: open");
		return (error);
	}
	*vpp = nvp;

	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
static vnode_t
checkalias(struct vnode *nvp, dev_t nvp_rdev)
{
	struct vnode *vp;
	struct vnode **vpp;
	int vid = 0;

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	SPECHASH_LOCK();

	/* find an existing vnode with the same rdev and type on the hash chain */
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
			vid = vp->v_id;
			break;
		}
	}
	SPECHASH_UNLOCK();

	if (vp) {
		/* vid was captured under SPECHASH_LOCK; a vid mismatch means vp was recycled */
		if (vnode_getwithvid(vp,vid)) {
			goto loop;
		}
		/*
		 * Termination state is checked in vnode_getwithvid
		 */
		vnode_lock(vp);

		/*
		 * Alias, but not in use, so flush it out.
		 */
		if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) {
			vnode_reclaim_internal(vp, 1, 1, 0);
			vnode_put_locked(vp);
			vnode_unlock(vp);
			goto loop;
		}
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
retnullvp:
		/* attach fresh specinfo to nvp and insert it on the hash chain */
		MALLOC_ZONE(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo),
			M_SPECINFO, M_WAITOK);
		bzero(nvp->v_specinfo, sizeof(struct specinfo));
		nvp->v_rdev = nvp_rdev;
		nvp->v_specflags = 0;
		nvp->v_speclastr = -1;

		SPECHASH_LOCK();
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		*vpp = nvp;
		SPECHASH_UNLOCK();

		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vnode_put_locked(vp);
			vnode_unlock(vp);
		}
		return (NULLVP);
	}
	/*
	 * vp->v_tag == VT_NON here: only bdevvp/VDEVFLUSH vnodes should
	 * legitimately be in that state; anything else is a bug.
	 */
	if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0)
		return(vp);
	else {
		panic("checkalias with VT_NON vp that shouldn't: %x", (unsigned int)vp);
		goto retnullvp;
	}
	return (vp);
}


/*
 * Get a reference on a particular vnode and lock it if requested.
 * If the vnode was on the inactive list, remove it from the list.
 * If the vnode was on the free list, remove it from the list and
 * move it to inactive list as needed.
 * The vnode lock bit is set if the vnode is being eliminated in
 * vgone. The process is awakened when the transition is completed,
 * and an error returned to indicate that the vnode is no longer
 * usable (possibly having been changed to a new file system type).
 */
static int
vget_internal(vnode_t vp, int vid, int vflags)
{
	int error = 0;
	int vpid;

	vnode_lock_spin(vp);

	if (vflags & VNODE_WITHID)
		vpid = vid;
	else
		vpid = vp->v_id;    // save off the original v_id

	if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0))
		/*
		 * vnode to be returned only if it has writers opened
		 */
		error = EINVAL;
	else
		error = vnode_getiocount(vp, vpid, vflags);

	vnode_unlock(vp);

	return (error);
}

/*
 * Take a long-term usecount reference on a vnode.
 *
 * Returns:	0			Success
 *		ENOENT			No such file or directory [terminating]
 */
int
vnode_ref(vnode_t vp)
{

	return (vnode_ref_ext(vp, 0));
}

/*
 * Like vnode_ref(), but fmode (FWRITE / O_EVTONLY) additionally bumps
 * v_writecount / v_kusecount respectively.
 *
 * Returns:	0			Success
 *		ENOENT			No such file or directory [terminating]
 */
int
vnode_ref_ext(vnode_t vp, int fmode)
{
	int	error = 0;

	vnode_lock_spin(vp);

	/*
	 * once all the current call sites have been fixed to insure they have
	 * taken an iocount, we can toughen this assert up and insist that the
	 * iocount is non-zero... a non-zero usecount doesn't insure correctness
	 */
	if (vp->v_iocount <= 0 && vp->v_usecount <= 0)
		panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);

	/*
	 * if you are the owner of drain/termination, can acquire usecount
	 */
	if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) {
		if (vp->v_owner != current_thread()) {
			error = ENOENT;
			goto out;
		}
	}
	vp->v_usecount++;

	if (fmode & FWRITE) {
		if (++vp->v_writecount <= 0)
			panic("vnode_ref_ext: v_writecount");
	}
	if (fmode & O_EVTONLY) {
		if (++vp->v_kusecount <= 0)
			panic("vnode_ref_ext: v_kusecount");
	}
	if (vp->v_flag & VRAGE) {
		struct uthread *ut;

		ut = get_bsdthread_info(current_thread());

		if ( !(current_proc()->p_lflag & P_LRAGE_VNODES) &&
		     !(ut->uu_flag & UT_RAGE_VNODES)) {
			/*
			 * a 'normal' process accessed this vnode
			 * so make sure its no longer marked
			 * for rapid aging... also, make sure
			 * it gets removed from the rage list...
			 * when v_usecount drops back to 0, it
			 * will be put back on the real free list
			 */
			vp->v_flag  &= ~VRAGE;
			vp->v_references = 0;
			vnode_list_remove(vp);
		}
	}
out:
	vnode_unlock(vp);

	return (error);
}


/*
 * put the vnode on appropriate free list.
 * called with vnode LOCKED
 *
 * Destination is chosen by state: rage list for VRAGE vnodes,
 * dead list for VL_DEAD, otherwise the regular free list
 * (head if VAGE, tail otherwise).
 */
static void
vnode_list_add(vnode_t vp)
{
	/*
	 * if it is already on a list or non zero references  return
	 */
	if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0))
		return;
	vnode_list_lock();

	if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
		/*
		 * add the new guy to the appropriate end of the RAGE list
		 */
		if ((vp->v_flag & VAGE))
			TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
		else
			TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);

		vp->v_listflag |= VLIST_RAGE;
		ragevnodes++;

		/*
		 * reset the timestamp for the last inserted vp on the RAGE
		 * queue to let new_vnode know that its not ok to start stealing
		 * from this list... as long as we're actively adding to this list
		 * we'll push out the vnodes we want to donate to the real free list
		 * once we stop pushing, we'll let some time elapse before we start
		 * stealing them in the new_vnode routine
		 */
		microuptime(&rage_tv);
	} else {
		/*
		 * if VL_DEAD, insert it at head of the dead list
		 * else insert at tail of LRU list or at head if VAGE is set
		 */
		if ( (vp->v_lflag & VL_DEAD)) {
			TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
			vp->v_listflag |= VLIST_DEAD;
			deadvnodes++;
		} else if ((vp->v_flag & VAGE)) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
			vp->v_flag &= ~VAGE;
			freevnodes++;
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			freevnodes++;
		}
	}
	vnode_list_unlock();
}


/*
 * remove the vnode from appropriate free list.
 * called with vnode LOCKED and
 * the list lock held
 */
static void
vnode_list_remove_locked(vnode_t vp)
{
	if (VONLIST(vp)) {
		/*
		 * the v_listflag field is
		 * protected by the vnode_list_lock
		 */
		if (vp->v_listflag & VLIST_RAGE)
			VREMRAGE("vnode_list_remove", vp);
		else if (vp->v_listflag & VLIST_DEAD)
			VREMDEAD("vnode_list_remove", vp);
		else
			VREMFREE("vnode_list_remove", vp);
	}
}


/*
 * remove the vnode from appropriate free list.
 * called with vnode LOCKED
 */
static void
vnode_list_remove(vnode_t vp)
{
	/*
	 * we want to avoid taking the list lock
	 * in the case where we're not on the free
	 * list... this will be true for most
	 * directories and any currently in use files
	 *
	 * we're guaranteed that we can't go from
	 * the not-on-list state to the on-list
	 * state since we hold the vnode lock...
	 * all calls to vnode_list_add are done
	 * under the vnode lock... so we can
	 * check for that condition (the prevelant one)
	 * without taking the list lock
	 */
	if (VONLIST(vp)) {
		vnode_list_lock();
		/*
		 * however, we're not guaranteed that
		 * we won't go from the on-list state
		 * to the not-on-list state until we
		 * hold the vnode_list_lock... this
		 * is due to "new_vnode" removing vnodes
		 * from the free list uder the list_lock
		 * w/o the vnode lock... so we need to
		 * check again whether we're currently
		 * on the free list
		 */
		vnode_list_remove_locked(vp);

		vnode_list_unlock();
	}
}


/* Drop a usecount reference taken with vnode_ref(). */
void
vnode_rele(vnode_t vp)
{
	vnode_rele_internal(vp, 0, 0, 0);
}


/* Drop a usecount reference taken with vnode_ref_ext(vp, fmode). */
void
vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
{
	vnode_rele_internal(vp, fmode, dont_reenter, 0);
}


/*
 * Core of vnode_rele/vnode_rele_ext.
 *
 * fmode mirrors the flags used at ref time (FWRITE / O_EVTONLY);
 * dont_reenter suppresses the immediate VNOP_INACTIVE call (the vnode
 * is marked VL_NEEDINACTIVE/VAGE instead); locked indicates the caller
 * already holds the vnode lock.
 */
void
vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
{
	if ( !locked)
		vnode_lock_spin(vp);

	if (--vp->v_usecount < 0)
		panic("vnode_rele_ext: vp %p usecount -ve : %d", vp,  vp->v_usecount);

	if (fmode & FWRITE) {
		if (--vp->v_writecount < 0)
			panic("vnode_rele_ext: vp %p writecount -ve : %ld", vp,  vp->v_writecount);
	}
	if (fmode & O_EVTONLY) {
		if (--vp->v_kusecount < 0)
			panic("vnode_rele_ext: vp %p kusecount -ve : %d", vp,  vp->v_kusecount);
	}
	if (vp->v_kusecount > vp->v_usecount)
		panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d)\n",vp, vp->v_kusecount, vp->v_usecount);
	if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) {
		/*
		 * vnode is still busy... if we're the last
		 * usecount, mark for a future call to VNOP_INACTIVE
		 * when the iocount finally drops to 0
		 */
		if (vp->v_usecount == 0) {
			vp->v_lflag |= VL_NEEDINACTIVE;
			vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);
		}
		if ( !locked)
			vnode_unlock(vp);
		return;
	}
	vp->v_flag  &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT);

	if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) {
		/*
		 * vnode is being cleaned, or
		 * we've requested that we don't reenter
		 * the filesystem on this release... in
		 * this case, we'll mark the vnode aged
		 * if it's been marked for termination
		 */
		if (dont_reenter) {
			if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) )
				vp->v_lflag |= VL_NEEDINACTIVE;
			vp->v_flag |= VAGE;
		}
		vnode_list_add(vp);
		if ( !locked)
			vnode_unlock(vp);
		return;
	}
	/*
	 * at this point both the iocount and usecount
	 * are zero
	 * pick up an iocount so that we can call
	 * VNOP_INACTIVE with the vnode lock unheld
	 */
	vp->v_iocount++;
#ifdef JOE_DEBUG
	record_vp(vp, 1);
#endif
	vp->v_lflag &= ~VL_NEEDINACTIVE;
	vnode_unlock(vp);

	VNOP_INACTIVE(vp, vfs_context_current());

	vnode_lock_spin(vp);
	/*
	 * because we dropped the vnode lock to call VNOP_INACTIVE
	 * the state of the vnode may have changed... we may have
	 * picked up an iocount, usecount or the MARKTERM may have
	 * been set... we need to reevaluate the reference counts
	 * to determine if we can call vnode_reclaim_internal at
	 * this point... if the reference counts are up, we'll pick
	 * up the MARKTERM state when they get subsequently dropped
	 */
	if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) &&
	     ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) {
		struct  uthread *ut;

		ut = get_bsdthread_info(current_thread());

		/*
		 * if this uthread is deferring reclaims, chain the vnode
		 * onto its uu_vreclaims list instead of reclaiming here;
		 * note the deferred path skips vnode_dropiocount/list_add
		 * below (the defer mechanism is responsible for those).
		 */
		if (ut->uu_defer_reclaims) {
			vp->v_defer_reclaimlist = ut->uu_vreclaims;
				ut->uu_vreclaims = vp;
			goto defer_reclaim;
		}
		vnode_lock_convert(vp);
		vnode_reclaim_internal(vp, 1, 1, 0);
	}
	vnode_dropiocount(vp);
	vnode_list_add(vp);
defer_reclaim:
	if ( !locked)
		vnode_unlock(vp);
	return;
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#if DIAGNOSTIC
int busyprt = 0;	/* print out busy vnodes */
#if 0
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif /* 0 */
#endif

int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
	struct vnode *vp;
	int busy = 0;
	int reclaimed = 0;
	int retval;
	int vid;

	mount_lock(mp);
	vnode_iterate_setup(mp);
	/*
	 * On regular unmounts(not forced) do a
	 * quick check for vnodes to be in use. This
	 * preserves the caching of vnodes. automounter
	 * tries unmounting every so often to see whether
	 * it is still busy or not.
	 */
	if (((flags & FORCECLOSE)==0)  && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) {
		if (vnode_umount_preflight(mp, skipvp, flags)) {
			vnode_iterate_clear(mp);
			mount_unlock(mp);
			return(EBUSY);
		}
	}
loop:
	/* it is returns 0 then there is nothing to do */
	retval = vnode_iterate_prepare(mp);

	if (retval == 0)  {
		vnode_iterate_clear(mp);
		mount_unlock(mp);
		return(retval);
	}

	/* iterate over all the vnodes */
	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
		vp = TAILQ_FIRST(&mp->mnt_workerqueue);
		TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
		if ( (vp->v_mount != mp) || (vp == skipvp)) {
			continue;
		}
		/*
		 * capture the v_id, then drop the mount lock to take the
		 * vnode lock (lock ordering); a changed v_id afterwards
		 * means the vnode was recycled out from under us.
		 */
		vid = vp->v_id;
		mount_unlock(mp);
		vnode_lock(vp);

		if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) {
				vnode_unlock(vp);
				mount_lock(mp);
				continue;
		}

		/*
		 * If requested, skip over vnodes marked VSYSTEM.
		 * Skip over all vnodes marked VNOFLUSH.
		 */
		if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) ||
		    (vp->v_flag & VNOFLUSH))) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If requested, skip over vnodes marked VSWAP.
		 */
		if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If requested, skip over vnodes marked VROOT.
		 */
		if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			vnode_unlock(vp);
			mount_lock(mp);
			continue;
		}
		/*
		 * If the real usecount is 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (((vp->v_usecount == 0) ||
		    ((vp->v_usecount - vp->v_kusecount) == 0))) {
			vp->v_iocount++;	/* so that drain waits for * other iocounts */
#ifdef JOE_DEBUG
			record_vp(vp, 1);
#endif
			vnode_reclaim_internal(vp, 1, 1, 0);
			vnode_dropiocount(vp);
			vnode_list_add(vp);
			vnode_unlock(vp);

			reclaimed++;
			mount_lock(mp);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vp->v_iocount++;	/* so that drain waits * for other iocounts */
#ifdef JOE_DEBUG
				record_vp(vp, 1);
#endif
				vnode_reclaim_internal(vp, 1, 1, 0);
				vnode_dropiocount(vp);
				vnode_list_add(vp);
				vnode_unlock(vp);
			} else {
				/* device node: detach from the fs but keep it usable as a bare spec vnode */
				vclean(vp, 0);
				vp->v_lflag &= ~VL_DEAD;
				vp->v_op = spec_vnodeop_p;
				vp->v_flag |= VDEVFLUSH;
				vnode_unlock(vp);
			}
			mount_lock(mp);
			continue;
		}
#if DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		vnode_unlock(vp);
		mount_lock(mp);
		busy++;
	}

	/* At this point the worker queue is completed */
	if (busy && ((flags & FORCECLOSE)==0) && reclaimed) {
		busy = 0;
		reclaimed = 0;
		(void)vnode_iterate_reloadq(mp);
		/* returned with mount lock held */
		goto loop;
	}

	/* if new vnodes were created in between retry the reclaim */
	if ( vnode_iterate_reloadq(mp) != 0) {
		if (!(busy && ((flags & FORCECLOSE)==0)))
			goto loop;
	}
	vnode_iterate_clear(mp);
	mount_unlock(mp);

	if (busy && ((flags & FORCECLOSE)==0))
		return (EBUSY);
	return (0);
}

long num_recycledvnodes = 0;	/* long for OSAddAtomic */
/*
 * Disassociate the underlying file system from a vnode.
 * The vnode lock is held on entry.
 *
 * flags: DOCLOSE closes/flushes buffers and VM pages; REVOKEALL is
 * passed through to VNOP_CLOSE as IO_REVOKE.  On return the vnode is
 * marked VL_DEAD with dead_vnodeop_p installed.
 */
static void
vclean(vnode_t vp, int flags)
{
	vfs_context_t ctx = vfs_context_current();
	int active;
	int need_inactive;
	int already_terminating;
	int clflags = 0;

#if NAMEDSTREAMS
	int is_namedstream;
#endif

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	active = vp->v_usecount;

	/*
	 * just in case we missed sending a needed
	 * VNOP_INACTIVE, we'll do it now
	 */
	need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);

	vp->v_lflag &= ~VL_NEEDINACTIVE;

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	already_terminating = (vp->v_lflag & VL_TERMINATE);

	vp->v_lflag |= VL_TERMINATE;

	/*
	 * remove the vnode from any mount list
	 * it might be on...
	 */
	insmntque(vp, (struct mount *)0);

#if NAMEDSTREAMS
	is_namedstream = vnode_isnamedstream(vp);
#endif

	vnode_unlock(vp);

	OSAddAtomic(1, &num_recycledvnodes);
	/*
	 * purge from the name cache as early as possible...
	 */
	cache_purge(vp);

	if (flags & DOCLOSE)
		clflags |= IO_NDELAY;
	if (flags & REVOKEALL)
		clflags |= IO_REVOKE;

	if (active && (flags & DOCLOSE))
		VNOP_CLOSE(vp, clflags, ctx);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE) {
#if NFSCLIENT
		if (vp->v_tag == VT_NFS)
			nfs_vinvalbuf(vp, V_SAVE, ctx, 0);
		else
#endif
		{
			VNOP_FSYNC(vp, MNT_WAIT, ctx);
			buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		}
		if (UBCINFOEXISTS(vp))
			/*
			 * Clean the pages in VM.
			 */
			(void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL);
	}
	if (active || need_inactive)
		VNOP_INACTIVE(vp, ctx);

#if NAMEDSTREAMS
	/* Delete the shadow stream file before we reclaim its vnode */
	if ((is_namedstream != 0) &&
			(vp->v_parent != NULLVP) &&
			(vnode_isshadow(vp))) {
		vnode_relenamedstream(vp->v_parent, vp, ctx);
	}
#endif

	/*
	 * Destroy ubc named reference
	 * cluster_release is done on this path
	 * along with dropping the reference on the ucred
	 */
	ubc_destroy_named(vp);

	/*
	 * Reclaim the vnode.
	 */
	if (VNOP_RECLAIM(vp, ctx))
		panic("vclean: cannot reclaim");

	// make sure the name & parent ptrs get cleaned out!
	vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME);

	vnode_lock(vp);

	/* point the vnode at the "dead" filesystem */
	vp->v_mount = dead_mountp;
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_data = NULL;

	vp->v_lflag |= VL_DEAD;

	if (already_terminating == 0) {
		vp->v_lflag &= ~VL_TERMINATE;
		/*
		 * Done with purge, notify sleepers of the grim news.
		 */
		if (vp->v_lflag & VL_TERMWANT) {
			vp->v_lflag &= ~VL_TERMWANT;
			wakeup(&vp->v_lflag);
		}
	}
}

/*
 * Eliminate all activity associated with  the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
#if DIAGNOSTIC
vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
#else
vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
#endif
{
	struct vnode *vq;
	int vid;

#if DIAGNOSTIC
	if ((flags & REVOKEALL) == 0)
		panic("vnop_revoke");
#endif

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		vnode_lock(vp);
		if (vp->v_lflag & VL_TERMINATE) {
			vnode_unlock(vp);
			return(ENOENT);
		}
		vnode_unlock(vp);
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		SPECHASH_LOCK();
		while (vp->v_flag & VALIASED) {
			/*
			 * reclaim each alias in turn; SPECHASH_LOCK is
			 * dropped around vnode_getwithvid/reclaim and the
			 * hash chain walk restarted each time
			 */
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				vid = vq->v_id;
				SPECHASH_UNLOCK();
				if (vnode_getwithvid(vq,vid)){
					SPECHASH_LOCK();
					break;
				}
				vnode_reclaim_internal(vq, 0, 1, 0);
				vnode_put(vq);
				SPECHASH_LOCK();
				break;
			}
		}
		SPECHASH_UNLOCK();
	}
	vnode_reclaim_internal(vp, 0, 0, REVOKEALL);

	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 *
 * Returns 1 if the vnode was reclaimed immediately, 0 if it was
 * busy and only marked VL_MARKTERM for later reclamation.
 */
int
vnode_recycle(struct vnode *vp)
{
	vnode_lock(vp);

	if (vp->v_iocount || vp->v_usecount) {
		vp->v_lflag |= VL_MARKTERM;
		vnode_unlock(vp);
		return(0);
	}
	vnode_reclaim_internal(vp, 1, 0, 0);

	vnode_unlock(vp);

	return (1);
}

/*
 * Mark a vnode (held with exactly one iocount and no usecount)
 * for termination when that iocount is dropped.
 * Returns 1 on success, 0 if the vnode is otherwise busy.
 */
static int
vnode_reload(vnode_t vp)
{
	vnode_lock_spin(vp);

	if ((vp->v_iocount > 1) || vp->v_usecount) {
		vnode_unlock(vp);
		return(0);
	}
	if (vp->v_iocount <= 0)
		panic("vnode_reload with no iocount %d", vp->v_iocount);

	/* mark for release when iocount is dropped */
	vp->v_lflag |= VL_MARKTERM;
	vnode_unlock(vp);

	return (1);
}


static void
vgone(vnode_t vp, int flags)
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * Clean out the filesystem specific data.
	 * vclean also takes care of removing the
	 * vnode from any mount list it might be on
	 */
	vclean(vp, flags | DOCLOSE);

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
			SPECHASH_LOCK();
			if (*vp->v_hashchain == vp) {
				*vp->v_hashchain = vp->v_specnext;
			} else {
				for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
					if (vq->v_specnext != vp)
						continue;
					vq->v_specnext = vp->v_specnext;
					break;
				}
			if (vq == NULL)
				panic("missing bdev");
			}
			if (vp->v_flag & VALIASED) {
				/*
				 * if exactly one alias remains after vp leaves
				 * the chain (vx found, walk completed with
				 * vq == NULL), it is no longer aliased
				 */
				vx = NULL;
				for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
					if (vq->v_rdev != vp->v_rdev ||
				    	vq->v_type != vp->v_type)
						continue;
					if (vx)
						break;
					vx = vq;
				}
				if (vx == NULL)
					panic("missing alias");
				if (vq == NULL)
					vx->v_flag &= ~VALIASED;
				vp->v_flag &= ~VALIASED;
			}
			SPECHASH_UNLOCK();
			{
			struct specinfo *tmp = vp->v_specinfo;
			vp->v_specinfo = NULL;
			FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO);
			}
	}
}

/*
 * Lookup a vnode by device number.
 */
int
check_mountedon(dev_t dev, enum vtype type, int  *errorp)
{
	vnode_t	vp;
	int rc = 0;
	int vid;

loop:
	SPECHASH_LOCK();
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		/* capture vid before dropping SPECHASH lock; restart if vp recycled */
		vid = vp->v_id;
		SPECHASH_UNLOCK();
		if (vnode_getwithvid(vp,vid))
			goto loop;
		vnode_lock_spin(vp);
		/* iocount > 1: someone beyond our own get holds it open */
		if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
			vnode_unlock(vp);
			if ((*errorp = vfs_mountedon(vp)) != 0)
				rc = 1;
		} else
			vnode_unlock(vp);
		vnode_put(vp);
		return(rc);
	}
	SPECHASH_UNLOCK();
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 *
 * Sums (usecount - kusecount) over all aliases of vp on its special
 * device hash chain; unreferenced aliases encountered along the way
 * are reclaimed and the walk restarted.
 */
int
vcount(vnode_t vp)
{
	vnode_t vq, vnext;
	int count;
	int vid;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount - vp->v_kusecount);
	count = 0;

	SPECHASH_LOCK();
	/*
	 * Grab first vnode and its vid.
	 */
	vq = *vp->v_hashchain;
	vid = vq ? vq->v_id : 0;

	SPECHASH_UNLOCK();

	while (vq) {
		/*
		 * Attempt to get the vnode outside the SPECHASH lock.
		 */
		if (vnode_getwithvid(vq, vid)) {
			goto loop;
		}
		vnode_lock(vq);

		if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
			if ((vq->v_usecount == 0) && (vq->v_iocount == 1)  && vq != vp) {
				/*
				 * Alias, but not in use, so flush it out.
				 */
				vnode_reclaim_internal(vq, 1, 1, 0);
				vnode_put_locked(vq);
				vnode_unlock(vq);
				goto loop;
			}
			count += (vq->v_usecount - vq->v_kusecount);
		}
		vnode_unlock(vq);

		SPECHASH_LOCK();
		/*
		 * must do this with the reference still held on 'vq'
		 * so that it can't be destroyed while we're poking
		 * through v_specnext
		 */
		vnext = vq->v_specnext;
		vid = vnext ? vnext->v_id : 0;

		SPECHASH_UNLOCK();

		vnode_put(vq);

		vq = vnext;
	}

	return (count);
}

int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

/*
 * Print out a description of a vnode.
 */
#if !CONFIG_NO_PRINTF_STRINGS
static const char *typename[] =
   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
#endif

void
vprint(const char *label, struct vnode *vp)
{
	char sbuf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %ld",
	       typename[vp->v_type], vp->v_usecount, vp->v_writecount);
	/* build a "|FLAG|FLAG" string, then print it skipping the leading '|' */
	sbuf[0] = '\0';
	if (vp->v_flag & VROOT)
		strlcat(sbuf, "|VROOT", sizeof(sbuf));
	if (vp->v_flag & VTEXT)
		strlcat(sbuf, "|VTEXT", sizeof(sbuf));
	if (vp->v_flag & VSYSTEM)
		strlcat(sbuf, "|VSYSTEM", sizeof(sbuf));
	if (vp->v_flag & VNOFLUSH)
		strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf));
	if (vp->v_flag & VBWAIT)
		strlcat(sbuf, "|VBWAIT", sizeof(sbuf));
	if (vp->v_flag & VALIASED)
		strlcat(sbuf, "|VALIASED", sizeof(sbuf));
	if (sbuf[0] != '\0')
		printf(" flags (%s)", &sbuf[1]);
}


/*
 * Build the path for vp into pathbuf.  *len is the buffer size on
 * input and the resulting path length on output (per build_path).
 */
int
vn_getpath(struct vnode *vp, char *pathbuf, int *len)
{
	return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
}


/* Fetch the code-signing cdhash for the blob covering 'offset' (thin wrapper). */
int
vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash)
{
	return ubc_cs_getcdhash(vp, offset, cdhash);
}


/* package-extension table installed by set_package_extensions_table() */
static char *extension_table=NULL;
static int   nexts;
static int   max_ext_width;

/*
 * qsort comparator: orders entries by string length.
 * NOTE(review): strlen() returns size_t, so the subtraction is done in
 * unsigned arithmetic and converted back to int — fine for realistic
 * extension lengths (< 256), but worth knowing.  Also note this sorts
 * shortest-first, while the comment below says longest-to-shortest;
 * callers iterate the whole table, so ordering does not affect matching.
 */
static int
extension_cmp(const void *a, const void *b)
{
    return (strlen((const char *)a) - strlen((const char *)b));
}


//
// This is the api LaunchServices uses to inform the kernel
// the list of package extensions to ignore.
2339// 2340// Internally we keep the list sorted by the length of the 2341// the extension (from longest to shortest). We sort the 2342// list of extensions so that we can speed up our searches 2343// when comparing file names -- we only compare extensions 2344// that could possibly fit into the file name, not all of 2345// them (i.e. a short 8 character name can't have an 8 2346// character extension). 2347// 2348__private_extern__ int 2349set_package_extensions_table(void *data, int nentries, int maxwidth) 2350{ 2351 char *new_exts; 2352 int error; 2353 2354 if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { 2355 return EINVAL; 2356 } 2357 2358 MALLOC(new_exts, char *, nentries * maxwidth, M_TEMP, M_WAITOK); 2359 2360 error = copyin(CAST_USER_ADDR_T(data), new_exts, nentries * maxwidth); 2361 if (error) { 2362 FREE(new_exts, M_TEMP); 2363 return error; 2364 } 2365 2366 if (extension_table) { 2367 FREE(extension_table, M_TEMP); 2368 } 2369 extension_table = new_exts; 2370 nexts = nentries; 2371 max_ext_width = maxwidth; 2372 2373 qsort(extension_table, nexts, maxwidth, extension_cmp); 2374 2375 return 0; 2376} 2377 2378 2379__private_extern__ int 2380is_package_name(const char *name, int len) 2381{ 2382 int i, extlen; 2383 const char *ptr, *name_ext; 2384 2385 if (len <= 3) { 2386 return 0; 2387 } 2388 2389 name_ext = NULL; 2390 for(ptr=name; *ptr != '\0'; ptr++) { 2391 if (*ptr == '.') { 2392 name_ext = ptr; 2393 } 2394 } 2395 2396 // if there is no "." extension, it can't match 2397 if (name_ext == NULL) { 2398 return 0; 2399 } 2400 2401 // advance over the "." 2402 name_ext++; 2403 2404 // now iterate over all the extensions to see if any match 2405 ptr = &extension_table[0]; 2406 for(i=0; i < nexts; i++, ptr+=max_ext_width) { 2407 extlen = strlen(ptr); 2408 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') { 2409 // aha, a match! 
2410 return 1; 2411 } 2412 } 2413 2414 // if we get here, no extension matched 2415 return 0; 2416} 2417 2418int 2419vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component) 2420{ 2421 char *ptr, *end; 2422 int comp=0; 2423 2424 *component = -1; 2425 if (*path != '/') { 2426 return EINVAL; 2427 } 2428 2429 end = path + 1; 2430 while(end < path + pathlen && *end != '\0') { 2431 while(end < path + pathlen && *end == '/' && *end != '\0') { 2432 end++; 2433 } 2434 2435 ptr = end; 2436 2437 while(end < path + pathlen && *end != '/' && *end != '\0') { 2438 end++; 2439 } 2440 2441 if (end > path + pathlen) { 2442 // hmm, string wasn't null terminated 2443 return EINVAL; 2444 } 2445 2446 *end = '\0'; 2447 if (is_package_name(ptr, end - ptr)) { 2448 *component = comp; 2449 break; 2450 } 2451 2452 end++; 2453 comp++; 2454 } 2455 2456 return 0; 2457} 2458 2459 2460/* 2461 * Top level filesystem related information gathering. 2462 */ 2463extern unsigned int vfs_nummntops; 2464 2465int 2466vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, 2467 user_addr_t newp, size_t newlen, proc_t p) 2468{ 2469 struct vfstable *vfsp; 2470 int *username; 2471 u_int usernamelen; 2472 int error; 2473 struct vfsconf *vfsc; 2474 2475 /* All non VFS_GENERIC and in VFS_GENERIC, 2476 * VFS_MAXTYPENUM, VFS_CONF, VFS_SET_PACKAGE_EXTS 2477 * needs to have root priv to have modifiers. 2478 * For rest the userland_sysctl(CTLFLAG_ANYBODY) would cover. 2479 */ 2480 if ((newp != USER_ADDR_NULL) && ((name[0] != VFS_GENERIC) || 2481 ((name[1] == VFS_MAXTYPENUM) || 2482 (name[1] == VFS_CONF) || 2483 (name[1] == VFS_SET_PACKAGE_EXTS))) 2484 && (error = suser(kauth_cred_get(), &p->p_acflag))) { 2485 return(error); 2486 } 2487 /* 2488 * The VFS_NUMMNTOPS shouldn't be at name[0] since 2489 * is a VFS generic variable. So now we must check 2490 * namelen so we don't end up covering any UFS 2491 * variables (sinc UFS vfc_typenum is 1). 
2492 * 2493 * It should have been: 2494 * name[0]: VFS_GENERIC 2495 * name[1]: VFS_NUMMNTOPS 2496 */ 2497 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) { 2498 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops)); 2499 } 2500 2501 /* all sysctl names at this level are at least name and field */ 2502 if (namelen < 2) 2503 return (EISDIR); /* overloaded */ 2504 if (name[0] != VFS_GENERIC) { 2505 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2506 if (vfsp->vfc_typenum == name[0]) 2507 break; 2508 if (vfsp == NULL) 2509 return (ENOTSUP); 2510 2511 /* XXX current context proxy for proc p? */ 2512 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2513 oldp, oldlenp, newp, newlen, 2514 vfs_context_current())); 2515 } 2516 switch (name[1]) { 2517 case VFS_MAXTYPENUM: 2518 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf)); 2519 case VFS_CONF: 2520 if (namelen < 3) 2521 return (ENOTDIR); /* overloaded */ 2522 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2523 if (vfsp->vfc_typenum == name[2]) 2524 break; 2525 if (vfsp == NULL) 2526 return (ENOTSUP); 2527 vfsc = (struct vfsconf *)vfsp; 2528 if (proc_is64bit(p)) { 2529 struct user_vfsconf usr_vfsc; 2530 usr_vfsc.vfc_vfsops = CAST_USER_ADDR_T(vfsc->vfc_vfsops); 2531 bcopy(vfsc->vfc_name, usr_vfsc.vfc_name, sizeof(usr_vfsc.vfc_name)); 2532 usr_vfsc.vfc_typenum = vfsc->vfc_typenum; 2533 usr_vfsc.vfc_refcount = vfsc->vfc_refcount; 2534 usr_vfsc.vfc_flags = vfsc->vfc_flags; 2535 usr_vfsc.vfc_mountroot = CAST_USER_ADDR_T(vfsc->vfc_mountroot); 2536 usr_vfsc.vfc_next = CAST_USER_ADDR_T(vfsc->vfc_next); 2537 return (sysctl_rdstruct(oldp, oldlenp, newp, &usr_vfsc, 2538 sizeof(usr_vfsc))); 2539 } 2540 else { 2541 return (sysctl_rdstruct(oldp, oldlenp, newp, vfsc, 2542 sizeof(struct vfsconf))); 2543 } 2544 2545 case VFS_SET_PACKAGE_EXTS: 2546 return set_package_extensions_table((void *)name[1], name[2], name[3]); 2547 } 2548 /* 2549 * We need to get back into the general MIB, so we need to re-prepend 2550 * 
CTL_VFS to our name and try userland_sysctl(). 2551 */ 2552 usernamelen = namelen + 1; 2553 MALLOC(username, int *, usernamelen * sizeof(*username), 2554 M_TEMP, M_WAITOK); 2555 bcopy(name, username + 1, namelen * sizeof(*name)); 2556 username[0] = CTL_VFS; 2557 error = userland_sysctl(p, username, usernamelen, oldp, 2558 oldlenp, 1, newp, newlen, oldlenp); 2559 FREE(username, M_TEMP); 2560 return (error); 2561} 2562 2563/* 2564 * Dump vnode list (via sysctl) - defunct 2565 * use "pstat" instead 2566 */ 2567/* ARGSUSED */ 2568int 2569sysctl_vnode 2570(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) 2571{ 2572 return(EINVAL); 2573} 2574 2575SYSCTL_PROC(_kern, KERN_VNODE, vnode, 2576 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED, 2577 0, 0, sysctl_vnode, "S,", ""); 2578 2579 2580/* 2581 * Check to see if a filesystem is mounted on a block device. 2582 */ 2583int 2584vfs_mountedon(struct vnode *vp) 2585{ 2586 struct vnode *vq; 2587 int error = 0; 2588 2589 SPECHASH_LOCK(); 2590 if (vp->v_specflags & SI_MOUNTEDON) { 2591 error = EBUSY; 2592 goto out; 2593 } 2594 if (vp->v_flag & VALIASED) { 2595 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2596 if (vq->v_rdev != vp->v_rdev || 2597 vq->v_type != vp->v_type) 2598 continue; 2599 if (vq->v_specflags & SI_MOUNTEDON) { 2600 error = EBUSY; 2601 break; 2602 } 2603 } 2604 } 2605out: 2606 SPECHASH_UNLOCK(); 2607 return (error); 2608} 2609 2610/* 2611 * Unmount all filesystems. The list is traversed in reverse order 2612 * of mounting to avoid dependencies. 2613 */ 2614__private_extern__ void 2615vfs_unmountall(void) 2616{ 2617 struct mount *mp; 2618 int error; 2619 2620 /* 2621 * Since this only runs when rebooting, it is not interlocked. 
2622 */ 2623 mount_list_lock(); 2624 while(!TAILQ_EMPTY(&mountlist)) { 2625 mp = TAILQ_LAST(&mountlist, mntlist); 2626 mount_list_unlock(); 2627 error = dounmount(mp, MNT_FORCE, 0, vfs_context_current()); 2628 if ((error != 0) && (error != EBUSY)) { 2629 printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname); 2630 printf("%d)\n", error); 2631 mount_list_lock(); 2632 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2633 continue; 2634 } else if (error == EBUSY) { 2635 /* If EBUSY is returned, the unmount was already in progress */ 2636 printf("unmount of %x failed (", (unsigned int)mp); 2637 printf("BUSY)\n"); 2638 } 2639 mount_list_lock(); 2640 } 2641 mount_list_unlock(); 2642} 2643 2644 2645/* 2646 * This routine is called from vnode_pager_deallocate out of the VM 2647 * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named 2648 * on a vnode that has a UBCINFO 2649 */ 2650__private_extern__ void 2651vnode_pager_vrele(vnode_t vp) 2652{ 2653 struct ubc_info *uip; 2654 2655 vnode_lock(vp); 2656 2657 vp->v_lflag &= ~VNAMED_UBC; 2658 2659 uip = vp->v_ubcinfo; 2660 vp->v_ubcinfo = UBC_INFO_NULL; 2661 2662 ubc_info_deallocate(uip); 2663 2664 vnode_unlock(vp); 2665} 2666 2667 2668#include <sys/disk.h> 2669 2670errno_t 2671vfs_init_io_attributes(vnode_t devvp, mount_t mp) 2672{ 2673 int error; 2674 off_t readblockcnt; 2675 off_t writeblockcnt; 2676 off_t readmaxcnt; 2677 off_t writemaxcnt; 2678 off_t readsegcnt; 2679 off_t writesegcnt; 2680 off_t readsegsize; 2681 off_t writesegsize; 2682 off_t alignment; 2683 u_long blksize; 2684 u_int64_t temp; 2685 u_int32_t features; 2686 vfs_context_t ctx = vfs_context_current(); 2687 2688 int isvirtual = 0; 2689 /* 2690 * determine if this mount point exists on the same device as the root 2691 * partition... 
if so, then it comes under the hard throttle control 2692 */ 2693 int thisunit = -1; 2694 static int rootunit = -1; 2695 2696 if (rootunit == -1) { 2697 if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, ctx)) 2698 rootunit = -1; 2699 else if (rootvp == devvp) 2700 mp->mnt_kern_flag |= MNTK_ROOTDEV; 2701 } 2702 if (devvp != rootvp && rootunit != -1) { 2703 if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, ctx) == 0) { 2704 if (thisunit == rootunit) 2705 mp->mnt_kern_flag |= MNTK_ROOTDEV; 2706 } 2707 } 2708 /* 2709 * force the spec device to re-cache 2710 * the underlying block size in case 2711 * the filesystem overrode the initial value 2712 */ 2713 set_fsblocksize(devvp); 2714 2715 2716 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, 2717 (caddr_t)&blksize, 0, ctx))) 2718 return (error); 2719 2720 mp->mnt_devblocksize = blksize; 2721 2722 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) { 2723 if (isvirtual) 2724 mp->mnt_kern_flag |= MNTK_VIRTUALDEV; 2725 } 2726 2727 if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES, 2728 (caddr_t)&features, 0, ctx))) 2729 return (error); 2730 2731 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, 2732 (caddr_t)&readblockcnt, 0, ctx))) 2733 return (error); 2734 2735 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, 2736 (caddr_t)&writeblockcnt, 0, ctx))) 2737 return (error); 2738 2739 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, 2740 (caddr_t)&readmaxcnt, 0, ctx))) 2741 return (error); 2742 2743 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, 2744 (caddr_t)&writemaxcnt, 0, ctx))) 2745 return (error); 2746 2747 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, 2748 (caddr_t)&readsegcnt, 0, ctx))) 2749 return (error); 2750 2751 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, 2752 (caddr_t)&writesegcnt, 0, ctx))) 2753 return (error); 2754 2755 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD, 2756 
(caddr_t)&readsegsize, 0, ctx))) 2757 return (error); 2758 2759 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, 2760 (caddr_t)&writesegsize, 0, ctx))) 2761 return (error); 2762 2763 if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT, 2764 (caddr_t)&alignment, 0, ctx))) 2765 return (error); 2766 2767 if (readmaxcnt) 2768 temp = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt; 2769 else { 2770 if (readblockcnt) { 2771 temp = readblockcnt * blksize; 2772 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; 2773 } else 2774 temp = MAXPHYS; 2775 } 2776 mp->mnt_maxreadcnt = (u_int32_t)temp; 2777 2778 if (writemaxcnt) 2779 temp = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt; 2780 else { 2781 if (writeblockcnt) { 2782 temp = writeblockcnt * blksize; 2783 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; 2784 } else 2785 temp = MAXPHYS; 2786 } 2787 mp->mnt_maxwritecnt = (u_int32_t)temp; 2788 2789 if (readsegcnt) { 2790 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt; 2791 mp->mnt_segreadcnt = (u_int16_t)temp; 2792 } 2793 if (writesegcnt) { 2794 temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt; 2795 mp->mnt_segwritecnt = (u_int16_t)temp; 2796 } 2797 if (readsegsize) 2798 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize; 2799 else 2800 temp = mp->mnt_maxreadcnt; 2801 mp->mnt_maxsegreadsize = (u_int32_t)temp; 2802 2803 if (writesegsize) 2804 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize; 2805 else 2806 temp = mp->mnt_maxwritecnt; 2807 mp->mnt_maxsegwritesize = (u_int32_t)temp; 2808 2809 if (alignment) 2810 temp = (alignment > PAGE_SIZE) ? 
PAGE_MASK : alignment - 1; 2811 else 2812 temp = 0; 2813 mp->mnt_alignmentmask = temp; 2814 2815 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) 2816 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; 2817 2818 return (error); 2819} 2820 2821static struct klist fs_klist; 2822lck_grp_t *fs_klist_lck_grp; 2823lck_mtx_t *fs_klist_lock; 2824 2825void 2826vfs_event_init(void) 2827{ 2828 klist_init(&fs_klist); 2829 fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL); 2830 fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL); 2831} 2832 2833void 2834vfs_event_signal(__unused fsid_t *fsid, u_int32_t event, __unused intptr_t data) 2835{ 2836 lck_mtx_lock(fs_klist_lock); 2837 KNOTE(&fs_klist, event); 2838 lck_mtx_unlock(fs_klist_lock); 2839} 2840 2841/* 2842 * return the number of mounted filesystems. 2843 */ 2844static int 2845sysctl_vfs_getvfscnt(void) 2846{ 2847 return(mount_getvfscnt()); 2848} 2849 2850 2851static int 2852mount_getvfscnt(void) 2853{ 2854 int ret; 2855 2856 mount_list_lock(); 2857 ret = nummounts; 2858 mount_list_unlock(); 2859 return (ret); 2860 2861} 2862 2863 2864 2865static int 2866mount_fillfsids(fsid_t *fsidlst, int count) 2867{ 2868 struct mount *mp; 2869 int actual=0; 2870 2871 actual = 0; 2872 mount_list_lock(); 2873 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2874 if (actual <= count) { 2875 fsidlst[actual] = mp->mnt_vfsstat.f_fsid; 2876 actual++; 2877 } 2878 } 2879 mount_list_unlock(); 2880 return (actual); 2881 2882} 2883 2884/* 2885 * fill in the array of fsid_t's up to a max of 'count', the actual 2886 * number filled in will be set in '*actual'. If there are more fsid_t's 2887 * than room in fsidlst then ENOMEM will be returned and '*actual' will 2888 * have the actual count. 2889 * having *actual filled out even in the error case is depended upon. 
 */
static int
sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual)
{
	struct mount *mp;

	*actual = 0;
	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		/* count every mount, but only store while there is room */
		(*actual)++;
		if (*actual <= count)
			fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid;
	}
	mount_list_unlock();
	/* ENOMEM signals "buffer too small"; *actual holds the full count */
	return (*actual <= count ? 0 : ENOMEM);
}

/*
 * Sysctl handler returning the list of mounted filesystem ids.
 * Read-only: a NULL oldptr query returns just the space required;
 * otherwise the fsid array is copied out.  If a mount is added
 * between sizing and filling (MALLOC may sleep), the fill returns
 * ENOMEM and we retry with the caller's original length.
 */
static int
sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1,
		__unused int arg2, struct sysctl_req *req)
{
	int actual, error;
	size_t space;
	fsid_t *fsidlst;

	/* This is a readonly node. */
	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	/* they are querying us so just return the space required. */
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
		return 0;
	}
again:
	/*
	 * Retrieve an accurate count of the amount of space required to copy
	 * out all the fsids in the system.
	 */
	space = req->oldlen;
	req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);

	/* they didn't give us enough space. */
	if (space < req->oldlen)
		return (ENOMEM);

	MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK);
	error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
	    &actual);
	/*
	 * If we get back ENOMEM, then another mount has been added while we
	 * slept in malloc above.  If this is the case then try again.
	 */
	if (error == ENOMEM) {
		FREE(fsidlst, M_TEMP);
		/* restore the caller's buffer length before re-sizing */
		req->oldlen = space;
		goto again;
	}
	if (error == 0) {
		error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
	}
	FREE(fsidlst, M_TEMP);
	return (error);
}

/*
 * Do a sysctl by fsid.
2957 */ 2958static int 2959sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, 2960 struct sysctl_req *req) 2961{ 2962 struct vfsidctl vc; 2963 struct user_vfsidctl user_vc; 2964 struct mount *mp; 2965 struct vfsstatfs *sp; 2966 int *name, flags, namelen; 2967 int error=0, gotref=0; 2968 vfs_context_t ctx = vfs_context_current(); 2969 proc_t p = req->p; /* XXX req->p != current_proc()? */ 2970 boolean_t is_64_bit; 2971 2972 name = arg1; 2973 namelen = arg2; 2974 is_64_bit = proc_is64bit(p); 2975 2976 if (is_64_bit) { 2977 error = SYSCTL_IN(req, &user_vc, sizeof(user_vc)); 2978 if (error) 2979 goto out; 2980 if (user_vc.vc_vers != VFS_CTL_VERS1) { 2981 error = EINVAL; 2982 goto out; 2983 } 2984 mp = mount_list_lookupby_fsid(&user_vc.vc_fsid, 0, 1); 2985 } 2986 else { 2987 error = SYSCTL_IN(req, &vc, sizeof(vc)); 2988 if (error) 2989 goto out; 2990 if (vc.vc_vers != VFS_CTL_VERS1) { 2991 error = EINVAL; 2992 goto out; 2993 } 2994 mp = mount_list_lookupby_fsid(&vc.vc_fsid, 0, 1); 2995 } 2996 if (mp == NULL) { 2997 error = ENOENT; 2998 goto out; 2999 } 3000 gotref = 1; 3001 /* reset so that the fs specific code can fetch it. */ 3002 req->newidx = 0; 3003 /* 3004 * Note if this is a VFS_CTL then we pass the actual sysctl req 3005 * in for "oldp" so that the lower layer can DTRT and use the 3006 * SYSCTL_IN/OUT routines. 
3007 */ 3008 if (mp->mnt_op->vfs_sysctl != NULL) { 3009 if (is_64_bit) { 3010 if (vfs_64bitready(mp)) { 3011 error = mp->mnt_op->vfs_sysctl(name, namelen, 3012 CAST_USER_ADDR_T(req), 3013 NULL, USER_ADDR_NULL, 0, 3014 ctx); 3015 } 3016 else { 3017 error = ENOTSUP; 3018 } 3019 } 3020 else { 3021 error = mp->mnt_op->vfs_sysctl(name, namelen, 3022 CAST_USER_ADDR_T(req), 3023 NULL, USER_ADDR_NULL, 0, 3024 ctx); 3025 } 3026 if (error != ENOTSUP) { 3027 goto out; 3028 } 3029 } 3030 switch (name[0]) { 3031 case VFS_CTL_UMOUNT: 3032 req->newidx = 0; 3033 if (is_64_bit) { 3034 req->newptr = user_vc.vc_ptr; 3035 req->newlen = (size_t)user_vc.vc_len; 3036 } 3037 else { 3038 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr); 3039 req->newlen = vc.vc_len; 3040 } 3041 error = SYSCTL_IN(req, &flags, sizeof(flags)); 3042 if (error) 3043 break; 3044 3045 mount_ref(mp, 0); 3046 mount_iterdrop(mp); 3047 gotref = 0; 3048 /* safedounmount consumes a ref */ 3049 error = safedounmount(mp, flags, ctx); 3050 break; 3051 case VFS_CTL_STATFS: 3052 req->newidx = 0; 3053 if (is_64_bit) { 3054 req->newptr = user_vc.vc_ptr; 3055 req->newlen = (size_t)user_vc.vc_len; 3056 } 3057 else { 3058 req->newptr = CAST_USER_ADDR_T(vc.vc_ptr); 3059 req->newlen = vc.vc_len; 3060 } 3061 error = SYSCTL_IN(req, &flags, sizeof(flags)); 3062 if (error) 3063 break; 3064 sp = &mp->mnt_vfsstat; 3065 if (((flags & MNT_NOWAIT) == 0 || (flags & MNT_WAIT)) && 3066 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) 3067 goto out; 3068 if (is_64_bit) { 3069 struct user_statfs sfs; 3070 bzero(&sfs, sizeof(sfs)); 3071 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 3072 sfs.f_type = mp->mnt_vtable->vfc_typenum; 3073 sfs.f_bsize = (user_long_t)sp->f_bsize; 3074 sfs.f_iosize = (user_long_t)sp->f_iosize; 3075 sfs.f_blocks = (user_long_t)sp->f_blocks; 3076 sfs.f_bfree = (user_long_t)sp->f_bfree; 3077 sfs.f_bavail = (user_long_t)sp->f_bavail; 3078 sfs.f_files = (user_long_t)sp->f_files; 3079 sfs.f_ffree = (user_long_t)sp->f_ffree; 
3080 sfs.f_fsid = sp->f_fsid; 3081 sfs.f_owner = sp->f_owner; 3082 3083 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); 3084 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); 3085 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); 3086 3087 error = SYSCTL_OUT(req, &sfs, sizeof(sfs)); 3088 } 3089 else { 3090 struct statfs sfs; 3091 bzero(&sfs, sizeof(struct statfs)); 3092 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 3093 sfs.f_type = mp->mnt_vtable->vfc_typenum; 3094 3095 /* 3096 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we 3097 * have to fudge the numbers here in that case. We inflate the blocksize in order 3098 * to reflect the filesystem size as best we can. 3099 */ 3100 if (sp->f_blocks > LONG_MAX) { 3101 int shift; 3102 3103 /* 3104 * Work out how far we have to shift the block count down to make it fit. 3105 * Note that it's possible to have to shift so far that the resulting 3106 * blocksize would be unreportably large. At that point, we will clip 3107 * any values that don't fit. 3108 * 3109 * For safety's sake, we also ensure that f_iosize is never reported as 3110 * being smaller than f_bsize. 3111 */ 3112 for (shift = 0; shift < 32; shift++) { 3113 if ((sp->f_blocks >> shift) <= LONG_MAX) 3114 break; 3115 if ((sp->f_bsize << (shift + 1)) > LONG_MAX) 3116 break; 3117 } 3118#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > LONG_MAX) ? 
LONG_MAX : ((x) >> (s))) 3119 sfs.f_blocks = (long)__SHIFT_OR_CLIP(sp->f_blocks, shift); 3120 sfs.f_bfree = (long)__SHIFT_OR_CLIP(sp->f_bfree, shift); 3121 sfs.f_bavail = (long)__SHIFT_OR_CLIP(sp->f_bavail, shift); 3122#undef __SHIFT_OR_CLIP 3123 sfs.f_bsize = (long)(sp->f_bsize << shift); 3124 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize); 3125 } else { 3126 sfs.f_bsize = (long)sp->f_bsize; 3127 sfs.f_iosize = (long)sp->f_iosize; 3128 sfs.f_blocks = (long)sp->f_blocks; 3129 sfs.f_bfree = (long)sp->f_bfree; 3130 sfs.f_bavail = (long)sp->f_bavail; 3131 } 3132 sfs.f_files = (long)sp->f_files; 3133 sfs.f_ffree = (long)sp->f_ffree; 3134 sfs.f_fsid = sp->f_fsid; 3135 sfs.f_owner = sp->f_owner; 3136 3137 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); 3138 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); 3139 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); 3140 3141 error = SYSCTL_OUT(req, &sfs, sizeof(sfs)); 3142 } 3143 break; 3144 default: 3145 error = ENOTSUP; 3146 goto out; 3147 } 3148out: 3149 if(gotref != 0) 3150 mount_iterdrop(mp); 3151 return (error); 3152} 3153 3154static int filt_fsattach(struct knote *kn); 3155static void filt_fsdetach(struct knote *kn); 3156static int filt_fsevent(struct knote *kn, long hint); 3157 3158struct filterops fs_filtops = 3159 { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; 3160 3161static int 3162filt_fsattach(struct knote *kn) 3163{ 3164 3165 lck_mtx_lock(fs_klist_lock); 3166 kn->kn_flags |= EV_CLEAR; 3167 KNOTE_ATTACH(&fs_klist, kn); 3168 lck_mtx_unlock(fs_klist_lock); 3169 return (0); 3170} 3171 3172static void 3173filt_fsdetach(struct knote *kn) 3174{ 3175 lck_mtx_lock(fs_klist_lock); 3176 KNOTE_DETACH(&fs_klist, kn); 3177 lck_mtx_unlock(fs_klist_lock); 3178} 3179 3180static int 3181filt_fsevent(struct knote *kn, long hint) 3182{ 3183 /* 3184 * Backwards compatibility: 3185 * Other filters would do nothing if kn->kn_sfflags == 0 3186 */ 3187 3188 if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) { 
		kn->kn_fflags |= hint;
	}

	/* knote fires while any accumulated event flags remain set */
	return (kn->kn_fflags != 0);
}

/*
 * Sysctl handler for the per-process P_NOREMOTEHANG flag.
 *
 * The new value is a pid_t: a positive pid sets the flag on that
 * process, a negative pid clears it on process |pid|.  If an old
 * pointer is supplied, the flag's current state (0/1) is copied out
 * and no modification is attempted.  Requires a new pointer (the pid);
 * modifying another process requires superuser.
 *
 * Returns 0 on success; EINVAL (no pid given), ESRCH (no such
 * process), EPERM (permission), or a SYSCTL_IN/OUT error otherwise.
 * The proc_find() reference is released on every path.
 */
static int
sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
		__unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	int out, error;
	pid_t pid;
	proc_t p;

	/* We need a pid. */
	if (req->newptr == USER_ADDR_NULL)
		return (EINVAL);

	error = SYSCTL_IN(req, &pid, sizeof(pid));
	if (error)
		return (error);

	/* look up by absolute value; the sign only encodes set vs clear */
	p = proc_find(pid < 0 ? -pid : pid);
	if (p == NULL)
		return (ESRCH);

	/*
	 * Fetching the value is ok, but we only fetch if the old
	 * pointer is given.
	 */
	if (req->oldptr != USER_ADDR_NULL) {
		/* normalize the flag test to exactly 0 or 1 */
		out = !((p->p_flag & P_NOREMOTEHANG) == 0);
		proc_rele(p);
		error = SYSCTL_OUT(req, &out, sizeof(out));
		return (error);
	}

	/* cansignal offers us enough security. */
	if (p != req->p && proc_suser(req->p) != 0) {
		proc_rele(p);
		return (EPERM);
	}

	/* atomic update: negative pid clears, positive pid sets */
	if (pid < 0)
		OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), (UInt32 *)&p->p_flag);
	else
		OSBitOrAtomic(P_NOREMOTEHANG, (UInt32 *)&p->p_flag);
	proc_rele(p);

	return (0);
}

/* the vfs.generic.
*/ 3242SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "vfs generic hinge"); 3243/* retreive a list of mounted filesystem fsid_t */ 3244SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD, 3245 NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids"); 3246/* perform operations on filesystem via fsid_t */ 3247SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW|CTLFLAG_LOCKED, 3248 sysctl_vfs_ctlbyfsid, "ctlbyfsid"); 3249SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW|CTLFLAG_ANYBODY, 3250 NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang"); 3251 3252 3253long num_reusedvnodes = 0; /* long for OSAddAtomic */ 3254 3255static int 3256new_vnode(vnode_t *vpp) 3257{ 3258 vnode_t vp; 3259 int retries = 0; /* retry incase of tablefull */ 3260 int force_alloc = 0, walk_count = 0; 3261 int vpid; 3262 struct timespec ts; 3263 struct timeval current_tv; 3264 struct unsafe_fsnode *l_unsafefs = 0; 3265 proc_t curproc = current_proc(); 3266 3267retry: 3268 microuptime(¤t_tv); 3269 3270 vp = NULLVP; 3271 3272 vnode_list_lock(); 3273 3274 if ( !TAILQ_EMPTY(&vnode_dead_list)) { 3275 /* 3276 * Can always reuse a dead one 3277 */ 3278 vp = TAILQ_FIRST(&vnode_dead_list); 3279 goto steal_this_vp; 3280 } 3281 /* 3282 * no dead vnodes available... 
if we're under 3283 * the limit, we'll create a new vnode 3284 */ 3285 if (numvnodes < desiredvnodes || force_alloc) { 3286 numvnodes++; 3287 vnode_list_unlock(); 3288 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK); 3289 bzero((char *)vp, sizeof(*vp)); 3290 VLISTNONE(vp); /* avoid double queue removal */ 3291 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr); 3292 3293 nanouptime(&ts); 3294 vp->v_id = ts.tv_nsec; 3295 vp->v_flag = VSTANDARD; 3296 3297#if CONFIG_MACF 3298 mac_vnode_label_init(vp); 3299#endif /* MAC */ 3300 3301 vp->v_iocount = 1; 3302 goto done; 3303 } 3304 3305#define MAX_WALK_COUNT 1000 3306 3307 if ( !TAILQ_EMPTY(&vnode_rage_list) && 3308 (ragevnodes >= rage_limit || 3309 (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) { 3310 3311 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) { 3312 if ( !(vp->v_listflag & VLIST_RAGE) || !(vp->v_flag & VRAGE)) 3313 panic("new_vnode: vp on RAGE list not marked both VLIST_RAGE and VRAGE"); 3314 3315 // if we're a dependency-capable process, skip vnodes that can 3316 // cause recycling deadlocks. (i.e. this process is diskimages 3317 // helper and the vnode is in a disk image). 3318 // 3319 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) { 3320 break; 3321 } 3322 3323 // don't iterate more than MAX_WALK_COUNT vnodes to 3324 // avoid keeping the vnode list lock held for too long. 3325 if (walk_count++ > MAX_WALK_COUNT) { 3326 vp = NULL; 3327 break; 3328 } 3329 } 3330 3331 } 3332 3333 if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) { 3334 /* 3335 * Pick the first vp for possible reuse 3336 */ 3337 walk_count = 0; 3338 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { 3339 // if we're a dependency-capable process, skip vnodes that can 3340 // cause recycling deadlocks. (i.e. 
this process is diskimages 3341 // helper and the vnode is in a disk image) 3342 // 3343 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) { 3344 break; 3345 } 3346 3347 // don't iterate more than MAX_WALK_COUNT vnodes to 3348 // avoid keeping the vnode list lock held for too long. 3349 if (walk_count++ > MAX_WALK_COUNT) { 3350 vp = NULL; 3351 break; 3352 } 3353 } 3354 3355 } 3356 3357 // 3358 // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT 3359 // then we're trying to create a vnode on behalf of a 3360 // process like diskimages-helper that has file systems 3361 // mounted on top of itself (and thus we can't reclaim 3362 // vnodes in the file systems on top of us). if we can't 3363 // find a vnode to reclaim then we'll just have to force 3364 // the allocation. 3365 // 3366 if (vp == NULL && walk_count >= MAX_WALK_COUNT) { 3367 force_alloc = 1; 3368 vnode_list_unlock(); 3369 goto retry; 3370 } 3371 3372 if (vp == NULL) { 3373 /* 3374 * we've reached the system imposed maximum number of vnodes 3375 * but there isn't a single one available 3376 * wait a bit and then retry... if we can't get a vnode 3377 * after 100 retries, than log a complaint 3378 */ 3379 if (++retries <= 100) { 3380 vnode_list_unlock(); 3381 delay_for_interval(1, 1000 * 1000); 3382 goto retry; 3383 } 3384 3385 vnode_list_unlock(); 3386 tablefull("vnode"); 3387 log(LOG_EMERG, "%d desired, %d numvnodes, " 3388 "%d free, %d dead, %d rage\n", 3389 desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes); 3390#if CONFIG_EMBEDDED 3391 /* 3392 * Running out of vnodes tends to make a system unusable. On an 3393 * embedded system, it's unlikely that the user can do anything 3394 * about it (or would know what to do, if they could). So panic 3395 * the system so it will automatically restart (and hopefully we 3396 * can get a panic log that tells us why we ran out). 
3397 */ 3398 panic("vnode table is full\n"); 3399#endif 3400 *vpp = NULL; 3401 return (ENFILE); 3402 } 3403steal_this_vp: 3404 vpid = vp->v_id; 3405 3406 vnode_list_remove_locked(vp); 3407 3408 vnode_list_unlock(); 3409 vnode_lock_spin(vp); 3410 3411 /* 3412 * We could wait for the vnode_lock after removing the vp from the freelist 3413 * and the vid is bumped only at the very end of reclaim. So it is possible 3414 * that we are looking at a vnode that is being terminated. If so skip it. 3415 */ 3416 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || 3417 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) { 3418 /* 3419 * we lost the race between dropping the list lock 3420 * and picking up the vnode_lock... someone else 3421 * used this vnode and it is now in a new state 3422 * so we need to go back and try again 3423 */ 3424 vnode_unlock(vp); 3425 goto retry; 3426 } 3427 if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) { 3428 /* 3429 * we did a vnode_rele_ext that asked for 3430 * us not to reenter the filesystem during 3431 * the release even though VL_NEEDINACTIVE was 3432 * set... we'll do it here by doing a 3433 * vnode_get/vnode_put 3434 * 3435 * pick up an iocount so that we can call 3436 * vnode_put and drive the VNOP_INACTIVE... 3437 * vnode_put will either leave us off 3438 * the freelist if a new ref comes in, 3439 * or put us back on the end of the freelist 3440 * or recycle us if we were marked for termination... 
3441 * so we'll just go grab a new candidate 3442 */ 3443 vp->v_iocount++; 3444#ifdef JOE_DEBUG 3445 record_vp(vp, 1); 3446#endif 3447 vnode_put_locked(vp); 3448 vnode_unlock(vp); 3449 goto retry; 3450 } 3451 OSAddAtomic(1, &num_reusedvnodes); 3452 3453 /* Checks for anyone racing us for recycle */ 3454 if (vp->v_type != VBAD) { 3455 if (vp->v_lflag & VL_DEAD) 3456 panic("new_vnode: the vnode is VL_DEAD but not VBAD"); 3457 vnode_lock_convert(vp); 3458 (void)vnode_reclaim_internal(vp, 1, 1, 0); 3459 3460 if ((VONLIST(vp))) 3461 panic("new_vnode: vp on list "); 3462 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount || 3463 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) 3464 panic("new_vnode: free vnode still referenced\n"); 3465 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) 3466 panic("new_vnode: vnode seems to be on mount list "); 3467 if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren)) 3468 panic("new_vnode: vnode still hooked into the name cache"); 3469 } 3470 if (vp->v_unsafefs) { 3471 l_unsafefs = vp->v_unsafefs; 3472 vp->v_unsafefs = (struct unsafe_fsnode *)NULL; 3473 } 3474 3475#if CONFIG_MACF 3476 /* 3477 * We should never see VL_LABELWAIT or VL_LABEL here. 3478 * as those operations hold a reference. 
 */
	assert ((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
	assert ((vp->v_lflag & VL_LABEL) != VL_LABEL);
	if (vp->v_lflag & VL_LABELED) {
		vnode_lock_convert(vp);
		mac_vnode_label_recycle(vp);
	}
#endif /* MAC */

	/* re-initialize the recycled vnode: caller gets it with one iocount held */
	vp->v_iocount = 1;
	vp->v_lflag = 0;
	vp->v_writecount = 0;
	vp->v_references = 0;
	vp->v_iterblkflags = 0;
	vp->v_flag = VSTANDARD;
	/* vbad vnodes can point to dead_mountp */
	vp->v_mount = NULL;
	vp->v_defer_reclaimlist = (vnode_t)0;

	vnode_unlock(vp);

	/* tear down any unsafe-fs state left over from the previous owner,
	 * after the vnode lock has been dropped */
	if (l_unsafefs) {
		lck_mtx_destroy(&l_unsafefs->fsnodelock, vnode_lck_grp);
		FREE_ZONE((void *)l_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS);
	}
done:
	*vpp = vp;

	return (0);
}

/*
 * Take the vnode's mutex (full blocking mutex).
 */
void
vnode_lock(vnode_t vp)
{
	lck_mtx_lock(&vp->v_lock);
}

/*
 * Take the vnode's mutex in spin mode; pair with vnode_lock_convert()
 * before any operation that may block (e.g. msleep, VNOP calls).
 */
void
vnode_lock_spin(vnode_t vp)
{
	lck_mtx_lock_spin(&vp->v_lock);
}

/*
 * Release the vnode's mutex.
 */
void
vnode_unlock(vnode_t vp)
{
	lck_mtx_unlock(&vp->v_lock);
}



/*
 * Take an iocount (I/O reference) on a vnode.
 * Returns ENOENT if the vnode is being terminated or is dead and
 * currently has no other iocount holders.
 */
int
vnode_get(struct vnode *vp)
{
	int retval;

	vnode_lock_spin(vp);
	retval = vnode_get_locked(vp);
	vnode_unlock(vp);

	return(retval);
}

/*
 * As vnode_get(), but the caller already holds the vnode lock.
 */
int
vnode_get_locked(struct vnode *vp)
{

	/* refuse new iocounts on a terminating/dead vnode nobody else holds */
	if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) {
		return(ENOENT);
	}
	vp->v_iocount++;
#ifdef JOE_DEBUG
	record_vp(vp, 1);
#endif
	return (0);
}

/*
 * Take an iocount only if 'vid' still matches the vnode's identity
 * (v_id is bumped on reclaim); fails on dead vnodes.
 */
int
vnode_getwithvid(vnode_t vp, int vid)
{
	return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID)));
}

/*
 * Take an iocount; caller is expected to already hold a usecount.
 */
int
vnode_getwithref(vnode_t vp)
{
	return(vget_internal(vp, 0, 0));
}


/*
 * Take an iocount unconditionally, even through drain/suspend/terminate.
 */
__private_extern__ int
vnode_getalways(vnode_t vp)
{
	return(vget_internal(vp, 0, VNODE_ALWAYS));
}

/*
 * Drop an iocount previously taken via one of the vnode_get* variants.
 */
int
vnode_put(vnode_t vp)
{
	int retval;

	vnode_lock_spin(vp);
	retval = vnode_put_locked(vp);
	vnode_unlock(vp);

	return(retval);
}

/*
 * Drop an iocount with the vnode lock held.  If this is the last
 * reference, runs any deferred VNOP_INACTIVE and, if the vnode is
 * marked for termination (VL_MARKTERM), reclaims it.  Always returns 0.
 */
int
vnode_put_locked(vnode_t vp)
{
	vfs_context_t ctx = vfs_context_current();	/* hoist outside loop */

retry:
	if (vp->v_iocount < 1)
		panic("vnode_put(%p): iocount < 1", vp);

	/* not the last reference: just drop the iocount and return */
	if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) {
		vnode_dropiocount(vp);
		return(0);
	}
	if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {

		vp->v_lflag &= ~VL_NEEDINACTIVE;
		vnode_unlock(vp);

		/* VNOP_INACTIVE may block, so it runs with the vnode unlocked */
		VNOP_INACTIVE(vp, ctx);

		vnode_lock_spin(vp);
		/*
		 * because we had to drop the vnode lock before calling
		 * VNOP_INACTIVE, the state of this vnode may have changed...
		 * we may pick up both VL_MARTERM and either
		 * an iocount or a usecount while in the VNOP_INACTIVE call
		 * we don't want to call vnode_reclaim_internal on a vnode
		 * that has active references on it... so loop back around
		 * and reevaluate the state
		 */
		goto retry;
	}
	vp->v_lflag &= ~VL_NEEDINACTIVE;

	if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) {
		/* reclaim may block: convert spin lock to full mutex first */
		vnode_lock_convert(vp);
		vnode_reclaim_internal(vp, 1, 1, 0);
	}
	vnode_dropiocount(vp);
	vnode_list_add(vp);

	return(0);
}

/* is vnode_t in use by others? 
*/ 3632int 3633vnode_isinuse(vnode_t vp, int refcnt) 3634{ 3635 return(vnode_isinuse_locked(vp, refcnt, 0)); 3636} 3637 3638 3639static int 3640vnode_isinuse_locked(vnode_t vp, int refcnt, int locked) 3641{ 3642 int retval = 0; 3643 3644 if (!locked) 3645 vnode_lock_spin(vp); 3646 if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) { 3647 retval = 1; 3648 goto out; 3649 } 3650 if (vp->v_type == VREG) { 3651 retval = ubc_isinuse_locked(vp, refcnt, 1); 3652 } 3653 3654out: 3655 if (!locked) 3656 vnode_unlock(vp); 3657 return(retval); 3658} 3659 3660 3661/* resume vnode_t */ 3662errno_t 3663vnode_resume(vnode_t vp) 3664{ 3665 3666 vnode_lock_spin(vp); 3667 3668 if (vp->v_owner == current_thread()) { 3669 vp->v_lflag &= ~VL_SUSPENDED; 3670 vp->v_owner = NULL; 3671 vnode_unlock(vp); 3672 wakeup(&vp->v_iocount); 3673 } else 3674 vnode_unlock(vp); 3675 3676 return(0); 3677} 3678 3679/* suspend vnode_t 3680 * Please do not use on more than one vnode at a time as it may 3681 * cause deadlocks. 3682 * xxx should we explicity prevent this from happening? 3683 */ 3684 3685errno_t 3686vnode_suspend(vnode_t vp) 3687{ 3688 if (vp->v_lflag & VL_SUSPENDED) { 3689 return(EBUSY); 3690 } 3691 3692 vnode_lock_spin(vp); 3693 3694 /* 3695 * xxx is this sufficient to check if a vnode_drain is 3696 * progress? 
3697 */ 3698 3699 if (vp->v_owner == NULL) { 3700 vp->v_lflag |= VL_SUSPENDED; 3701 vp->v_owner = current_thread(); 3702 } 3703 vnode_unlock(vp); 3704 3705 return(0); 3706} 3707 3708 3709 3710static errno_t 3711vnode_drain(vnode_t vp) 3712{ 3713 3714 if (vp->v_lflag & VL_DRAIN) { 3715 panic("vnode_drain: recursuve drain"); 3716 return(ENOENT); 3717 } 3718 vp->v_lflag |= VL_DRAIN; 3719 vp->v_owner = current_thread(); 3720 3721 while (vp->v_iocount > 1) 3722 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL); 3723 return(0); 3724} 3725 3726 3727/* 3728 * if the number of recent references via vnode_getwithvid or vnode_getwithref 3729 * exceeds this threshhold, than 'UN-AGE' the vnode by removing it from 3730 * the LRU list if it's currently on it... once the iocount and usecount both drop 3731 * to 0, it will get put back on the end of the list, effectively making it younger 3732 * this allows us to keep actively referenced vnodes in the list without having 3733 * to constantly remove and add to the list each time a vnode w/o a usecount is 3734 * referenced which costs us taking and dropping a global lock twice. 
 */
#define UNAGE_THRESHHOLD	25

/*
 * Core of the vnode_get* family: take an iocount subject to the
 * drain/suspend/terminate state machine.
 *
 * vflags:	VNODE_NODEAD	fail (ENOENT) on dead vnodes with no fs data
 *		VNODE_NOSUSPEND	fail (ENOENT) on suspended vnodes
 *		VNODE_ALWAYS	never block; acquire regardless of state
 * vid:		expected v_id; a mismatch after waiting means the vnode was
 *		recycled and ENOENT is returned.
 * Caller holds the vnode lock (spin mode is fine; converted before sleeping).
 */
static errno_t
vnode_getiocount(vnode_t vp, int vid, int vflags)
{
	int nodead = vflags & VNODE_NODEAD;
	int nosusp = vflags & VNODE_NOSUSPEND;
	int always = vflags & VNODE_ALWAYS;

	for (;;) {
		/*
		 * if it is a dead vnode with deadfs
		 */
		if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) {
			return(ENOENT);
		}
		/*
		 * will return VL_DEAD ones
		 */
		if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) {
			break;
		}
		/*
		 * if suspended vnodes are to be failed
		 */
		if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
			return(ENOENT);
		}
		/*
		 * if you are the owner of drain/suspend/termination , can acquire iocount
		 * check for VL_TERMINATE; it does not set owner
		 */
		if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) &&
		    (vp->v_owner == current_thread())) {
			break;
		}
		if (always != 0)
			break;
		/* about to sleep: need the full mutex, not the spin variant */
		vnode_lock_convert(vp);

		if (vp->v_lflag & VL_TERMINATE) {
			vp->v_lflag |= VL_TERMWANT;

			msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", NULL);
		} else
			msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
	}
	/* vnode was recycled (v_id bumped) while we slept */
	if (vid != vp->v_id) {
		return(ENOENT);
	}
	/* heavily referenced vnode: pull it off the LRU list ("un-age" it,
	 * see the comment above UNAGE_THRESHHOLD) */
	if (++vp->v_references >= UNAGE_THRESHHOLD) {
		vp->v_references = 0;
		vnode_list_remove(vp);
	}
	vp->v_iocount++;
#ifdef JOE_DEBUG
	record_vp(vp, 1);
#endif
	return(0);
}

/*
 * Drop one iocount; wakes a drainer/suspender once the count is down
 * to at most the single reference the waiter itself accounts for.
 */
static void
vnode_dropiocount (vnode_t vp)
{
	if (vp->v_iocount < 1)
		panic("vnode_dropiocount(%p): v_iocount < 1", vp);

	vp->v_iocount--;
#ifdef JOE_DEBUG
	record_vp(vp, -1);
#endif
	if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) {
		/* wakeup may block; convert from spin mode first */
		vnode_lock_convert(vp);
		wakeup(&vp->v_iocount);
	}
}


void
vnode_reclaim(struct vnode * vp)
{
	/* public entry point: caller holds neither the lock nor intends reuse */
	vnode_reclaim_internal(vp, 0, 0, 0);
}

/*
 * Detach a vnode from its filesystem and give it a new identity.
 *
 * locked:	non-zero if the caller already holds the vnode lock.
 * reuse:	non-zero when the caller (e.g. new_vnode) will immediately
 *		reuse the vnode, so it must NOT be put back on a free list here.
 * flags:	passed through to vgone().
 *
 * Panics if a reclaim is already in progress (VL_TERMINATE set) or if any
 * of the post-clean invariants (no v_data, no pending I/O, no ubc info,
 * no parent/name) are violated.
 */
__private_extern__
void
vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
{
	int isfifo = 0;

	if (!locked)
		vnode_lock(vp);

	if (vp->v_lflag & VL_TERMINATE) {
		panic("vnode reclaim in progress");
	}
	vp->v_lflag |= VL_TERMINATE;

	vn_clearunionwait(vp, 1);

	/* wait for all other iocount holders to go away */
	if (vnode_drain(vp)) {
		panic("vnode drain failed");
		vnode_unlock(vp);
		return;
	}
	isfifo = (vp->v_type == VFIFO);

	if (vp->v_type != VBAD)
		vgone(vp, flags);		/* clean and reclaim the vnode */

	/*
	 * give the vnode a new identity so that vnode_getwithvid will fail
	 * on any stale cache accesses...
	 * grab the list_lock so that if we're in "new_vnode"
	 * behind the list_lock trying to steal this vnode, the v_id is stable...
	 * once new_vnode drops the list_lock, it will block trying to take
	 * the vnode lock until we release it... at that point it will evaluate
	 * whether the v_vid has changed
	 * also need to make sure that the vnode isn't on a list where "new_vnode"
	 * can find it after the v_id has been bumped until we are completely done
	 * with the vnode (i.e. putting it back on a list has to be the very last
	 * thing we do to this vnode... many of the callers of vnode_reclaim_internal
	 * are holding an io_count on the vnode... they need to drop the io_count
	 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
	 * they are completely done with the vnode
	 */
	vnode_list_lock();

	vnode_list_remove_locked(vp);
	vp->v_id++;

	vnode_list_unlock();

	if (isfifo) {
		struct fifoinfo * fip;

		fip = vp->v_fifoinfo;
		vp->v_fifoinfo = NULL;
		FREE(fip, M_TEMP);
	}

	vp->v_type = VBAD;

	/* sanity-check that vgone/vclean left the vnode fully detached */
	if (vp->v_data)
		panic("vnode_reclaim_internal: cleaned vnode isn't");
	if (vp->v_numoutput)
		panic("vnode_reclaim_internal: clean vnode has pending I/O's");
	if (UBCINFOEXISTS(vp))
		panic("vnode_reclaim_internal: ubcinfo not cleaned");
	if (vp->v_parent)
		panic("vnode_reclaim_internal: vparent not removed");
	if (vp->v_name)
		panic("vnode_reclaim_internal: vname not removed");

	vp->v_socket = NULL;

	vp->v_lflag &= ~VL_TERMINATE;
	vp->v_lflag &= ~VL_DRAIN;
	vp->v_owner = NULL;

	/* wake anyone blocked in vnode_getiocount waiting for termination */
	if (vp->v_lflag & VL_TERMWANT) {
		vp->v_lflag &= ~VL_TERMWANT;
		wakeup(&vp->v_lflag);
	}
	if (!reuse) {
		/*
		 * make sure we get on the
		 * dead list if appropriate
		 */
		vnode_list_add(vp);
	}
	if (!locked)
		vnode_unlock(vp);
}

/* USAGE:
 * vnode_create(int flavor, size_t size, void * param, vnode_t *vp)
 *
 * Allocate and initialize a new vnode from the fsparam description in
 * 'data'.  On success the vnode is returned through *vpp with an iocount
 * held; returns EINVAL for unrecognized flavor/size, or the error from
 * new_vnode()/ubc_info_init_withsize() on failure.
 */
int
vnode_create(int flavor, size_t size, void *data, vnode_t *vpp)
{
	int error;
	int insert = 1;
	vnode_t vp;
	vnode_t nvp;
	vnode_t dvp;
	struct  uthread *ut;
	struct componentname *cnp;
	struct vnode_fsparam *param = (struct vnode_fsparam *)data;

	if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) {
		if ( (error = new_vnode(&vp)) ) {
			return(error);
		} else {
			dvp = param->vnfs_dvp;
			cnp = param->vnfs_cnp;

			vp->v_op = param->vnfs_vops;
			vp->v_type = param->vnfs_vtype;
			vp->v_data = param->vnfs_fsnode;

			if (param->vnfs_markroot)
				vp->v_flag |= VROOT;
			if (param->vnfs_marksystem)
				vp->v_flag |= VSYSTEM;
			if (vp->v_type == VREG) {
				error = ubc_info_init_withsize(vp, param->vnfs_filesize);
				if (error) {
#ifdef JOE_DEBUG
					record_vp(vp, 1);
#endif
					/* mark the vnode dead before releasing it */
					vp->v_mount = NULL;
					vp->v_op = dead_vnodeop_p;
					vp->v_tag = VT_NON;
					vp->v_data = NULL;
					vp->v_type = VBAD;
					vp->v_lflag |= VL_DEAD;

					vnode_put(vp);
					return(error);
				}
			}
#ifdef JOE_DEBUG
			record_vp(vp, 1);
#endif
			if (vp->v_type == VCHR || vp->v_type == VBLK) {

				vp->v_tag = VT_DEVFS;		/* callers will reset if needed (bdevvp) */

				if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
					/*
					 * if checkalias returns a vnode, it will be locked
					 *
					 * first get rid of the unneeded vnode we acquired
					 */
					vp->v_data = NULL;
					vp->v_op = spec_vnodeop_p;
					vp->v_type = VBAD;
					vp->v_lflag = VL_DEAD;
					vp->v_data = NULL;
					vp->v_tag = VT_NON;
					vnode_put(vp);

					/*
					 * switch to aliased vnode and finish
					 * preparing it
					 */
					vp = nvp;

					vclean(vp, 0);
					vp->v_op = param->vnfs_vops;
					vp->v_type = param->vnfs_vtype;
					vp->v_data = param->vnfs_fsnode;
					vp->v_lflag = 0;
					vp->v_mount = NULL;
					insmntque(vp, param->vnfs_mp);
					insert = 0;
					vnode_unlock(vp);
				}
			}

			if (vp->v_type == VFIFO) {
				struct fifoinfo *fip;

				MALLOC(fip, struct fifoinfo *,
					sizeof(*fip), M_TEMP, M_WAITOK);
				bzero(fip, sizeof(struct fifoinfo ));
				vp->v_fifoinfo = fip;
			}
			/* The file systems usually pass the address of the location where
			 * where there store the vnode pointer. When we add the vnode in mount
			 * point and name cache they are discoverable. So the file system node
			 * will have the connection to vnode setup by then
			 */
			*vpp = vp;

			/* Add fs named reference. */
			if (param->vnfs_flags & VNFS_ADDFSREF) {
				vp->v_lflag |= VNAMED_FSHASH;
			}
			if (param->vnfs_mp) {
				if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
					vp->v_flag |= VLOCKLOCAL;
				if (insert) {
					if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
						panic("insmntque: vp on the free list\n");

					/*
					 * enter in mount vnode list
					 */
					insmntque(vp, param->vnfs_mp);
				}
#ifdef INTERIM_FSNODE_LOCK
				if (param->vnfs_mp->mnt_vtable->vfc_threadsafe == 0) {
					MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *,
						    sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK);
					vp->v_unsafefs->fsnode_count = 0;
					vp->v_unsafefs->fsnodeowner = (void *)NULL;
					lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr);
				}
#endif /* INTERIM_FSNODE_LOCK */
			}
			/* hold a usecount on the parent for the v_parent back-pointer */
			if (dvp && vnode_ref(dvp) == 0) {
				vp->v_parent = dvp;
			}
			if (cnp) {
				if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) {
					/*
					 * enter into name cache
					 * we've got the info to enter it into the name cache now
					 */
					cache_enter(dvp, vp, cnp);
				}
				vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0);
				if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED)
					vp->v_flag |= VISUNION;
			}
			if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) {
				/*
				 * this vnode is being created as cacheable in the name cache
				 * this allows us to re-enter it in the cache
				 */
				vp->v_flag |= VNCACHEABLE;
			}
			ut = get_bsdthread_info(current_thread());

			if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
			    (ut->uu_flag & UT_RAGE_VNODES)) {
				/*
				 * process has indicated that it wants any
				 * vnodes created on its behalf to be rapidly
				 * aged to reduce the impact on the cached set
				 * of vnodes
				 */
				vp->v_flag |= VRAGE;
			}
			return(0);
		}
	}
	return (EINVAL);
}

/*
 * Mark the vnode as holding a filesystem named reference (VNAMED_FSHASH).
 * Panics if the reference already exists or the vnode is on the free list.
 */
int
vnode_addfsref(vnode_t vp)
{
	vnode_lock_spin(vp);
	if (vp->v_lflag & VNAMED_FSHASH)
		panic("add_fsref: vp already has named reference");
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
		panic("addfsref: vp on the free list\n");
	vp->v_lflag |= VNAMED_FSHASH;
	vnode_unlock(vp);
	return(0);

}
/*
 * Clear the filesystem named reference; panics if it was not set.
 */
int
vnode_removefsref(vnode_t vp)
{
	vnode_lock_spin(vp);
	if ((vp->v_lflag & VNAMED_FSHASH) == 0)
		panic("remove_fsref: no named reference");
	vp->v_lflag &= ~VNAMED_FSHASH;
	vnode_unlock(vp);
	return(0);

}


/*
 * Invoke 'callout' once for each live mount point.  Iteration stops early
 * when the callout returns VFS_RETURNED_DONE or VFS_CLAIMED_DONE.
 * Mounts are looked up by fsid snapshot so the mount list lock is not
 * held across the callout.
 */
int
vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg)
{
	mount_t	mp;
	int ret = 0;
	fsid_t * fsid_list;
	int count, actualcount, i;
	void * allocmem;

	count = mount_getvfscnt();
	count += 10;	/* slack for mounts created while we allocate */

	/* NOTE(review): kalloc result is not checked for NULL here —
	 * confirm this kalloc variant cannot fail / always blocks */
	fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t));
	allocmem = (void *)fsid_list;

	actualcount = mount_fillfsids(fsid_list, count);

	for (i=0; i< actualcount; i++) {

		/* obtain the mount point with iteration reference */
		mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1);

		if(mp == (struct mount *)0)
			continue;
		mount_lock(mp);
		if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) {
			mount_unlock(mp);
			mount_iterdrop(mp);
			continue;

		}
		mount_unlock(mp);

		/* iterate over all the vnodes */
		ret = callout(mp, arg);

		mount_iterdrop(mp);

		switch (ret) {
		case VFS_RETURNED:
		case VFS_RETURNED_DONE:
			if (ret == VFS_RETURNED_DONE) {
				ret = 0;
				goto out;
			}
			break;

		case VFS_CLAIMED_DONE:
			ret = 0;
			goto out;
		case VFS_CLAIMED:
		default:
			break;
		}
		ret = 0;
	}

out:
	kfree(allocmem, (count * sizeof(fsid_t)));
	return (ret);
}

/*
 * Update the vfsstatfs structure in the mountpoint.
 * MAC: Parameter eventtype added, indicating whether the event that
 * triggered this update came from user space, via a system call
 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
 *
 * Returns 0 on success, or the error from the MAC getattr check or
 * from vfs_getattr().  Only attributes the filesystem reports as
 * supported are copied into mnt_vfsstat; the rest keep defaults.
 */
int
vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
{
	struct vfs_attr	va;
	int		error;

	/*
	 * Request the attributes we want to propagate into
	 * the per-mount vfsstat structure.
	 */
	VFSATTR_INIT(&va);
	VFSATTR_WANTED(&va, f_iosize);
	VFSATTR_WANTED(&va, f_blocks);
	VFSATTR_WANTED(&va, f_bfree);
	VFSATTR_WANTED(&va, f_bavail);
	VFSATTR_WANTED(&va, f_bused);
	VFSATTR_WANTED(&va, f_files);
	VFSATTR_WANTED(&va, f_ffree);
	VFSATTR_WANTED(&va, f_bsize);
	VFSATTR_WANTED(&va, f_fssubtype);
#if CONFIG_MACF
	if (eventtype == VFS_USER_EVENT) {
		error = mac_mount_check_getattr(ctx, mp, &va);
		if (error != 0)
			return (error);
	}
#endif

	if ((error = vfs_getattr(mp, &va, ctx)) != 0) {
		KAUTH_DEBUG("STAT - filesystem returned error %d", error);
		return(error);
	}

	/*
	 * Unpack into the per-mount structure.
	 *
	 * We only overwrite these fields, which are likely to change:
	 *	f_blocks
	 *	f_bfree
	 *	f_bavail
	 *	f_bused
	 *	f_files
	 *	f_ffree
	 *
	 * And these which are not, but which the FS has no other way
	 * of providing to us:
	 *	f_bsize
	 *	f_iosize
	 *	f_fssubtype
	 *
	 */
	if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
		/* 4822056 - protect against malformed server mount */
		mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? 
va.f_bsize : 512);
	} else {
		mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize;	/* default from the device block size */
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
		mp->mnt_vfsstat.f_iosize = va.f_iosize;
	} else {
		mp->mnt_vfsstat.f_iosize = 1024 * 1024;		/* 1MB sensible I/O size */
	}
	if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
		mp->mnt_vfsstat.f_blocks = va.f_blocks;
	if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
		mp->mnt_vfsstat.f_bfree = va.f_bfree;
	if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
		mp->mnt_vfsstat.f_bavail = va.f_bavail;
	if (VFSATTR_IS_SUPPORTED(&va, f_bused))
		mp->mnt_vfsstat.f_bused = va.f_bused;
	if (VFSATTR_IS_SUPPORTED(&va, f_files))
		mp->mnt_vfsstat.f_files = va.f_files;
	if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
		mp->mnt_vfsstat.f_ffree = va.f_ffree;

	/* this is unlikely to change, but has to be queried for */
	if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
		mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;

	return(0);
}

/*
 * Append a mount point to the global mount list (mountlist),
 * under the mount list lock.
 */
void
mount_list_add(mount_t mp)
{
	mount_list_lock();
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	nummounts++;
	mount_list_unlock();
}

/*
 * Remove a mount point from the global mount list and clear its
 * list linkage, under the mount list lock.
 */
void
mount_list_remove(mount_t mp)
{
	mount_list_lock();
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	nummounts--;
	mp->mnt_list.tqe_next = NULL;
	mp->mnt_list.tqe_prev = NULL;
	mount_list_unlock();
}

#if CONFIG_VOLFS
/*
 * Look up a mount point by its volfs id (first word of f_fsid).
 * Only mounts that advertise MNTK_PATH_FROM_ID and are not unmounting
 * are eligible.  When 'withref' is set, an iteration reference is taken
 * and a non-blocking vfs_busy is attempted; returns NULL on any failure.
 */
mount_t
mount_lookupby_volfsid(int volfs_id, int withref)
{
	mount_t cur_mount = (mount_t)0;
	mount_t mp;

	mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
		    (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
		    (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) {
			cur_mount = mp;
			if (withref) {
				if (mount_iterref(cur_mount, 1)) {
					cur_mount = (mount_t)0;
					mount_list_unlock();
					goto out;
				}
			}
			break;
		}
	}
	mount_list_unlock();
	/* busy the mount outside the list lock; drop the iter ref either way */
	if (withref && (cur_mount != (mount_t)0)) {
		mp = cur_mount;
		if (vfs_busy(mp, LK_NOWAIT) != 0) {
			cur_mount = (mount_t)0;
		}
		mount_iterdrop(mp);
	}
out:
	return(cur_mount);
}
#endif


/*
 * Look up a mount point by full fsid (both words).  'locked' indicates
 * the caller already holds the mount list lock; 'withref' requests an
 * iteration reference on the result (NULL is returned if it cannot be
 * taken).
 */
mount_t
mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref)
{
	mount_t retmp = (mount_t)0;
	mount_t mp;

	if (!locked)
		mount_list_lock();
	TAILQ_FOREACH(mp, &mountlist, mnt_list)
		if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) {
			retmp = mp;
			if (withref) {
				if (mount_iterref(retmp, 1))
					retmp = (mount_t)0;
			}
			goto out;
		}
out:
	if (!locked)
		mount_list_unlock();
	return (retmp);
}

/*
 * Resolve 'path' to a vnode, returned through *vpp with an iocount held.
 * flags: VNODE_LOOKUP_NOFOLLOW, VNODE_LOOKUP_NOCROSSMOUNT,
 * VNODE_LOOKUP_DOWHITEOUT.  Returns the namei() error on failure.
 */
errno_t
vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx)
{
	struct nameidata nd;
	int error;
	u_long ndflags = 0;

	if (ctx == NULL) {		/* XXX technically an error */
		ctx = vfs_context_current();
	}

	if (flags & VNODE_LOOKUP_NOFOLLOW)
		ndflags = NOFOLLOW;
	else
		ndflags = FOLLOW;

	if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
		ndflags |= NOCROSSMOUNT;
	if (flags & VNODE_LOOKUP_DOWHITEOUT)
		ndflags |= DOWHITEOUT;

	/* XXX AUDITVNPATH1 needed ? */
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	if ((error = namei(&nd)))
		return (error);
	*vpp = nd.ni_vp;
	nameidone(&nd);

	return (0);
}

/*
 * Open 'path' with open-mode 'fmode' and create-mode 'cmode', returning
 * the vnode through *vpp.  O_NOFOLLOW in fmode implies
 * VNODE_LOOKUP_NOFOLLOW.  On error *vpp is set to NULL.
 */
errno_t
vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
{
	struct nameidata nd;
	int error;
	u_long ndflags = 0;
	int lflags = flags;

	if (ctx == NULL) {		/* XXX technically an error */
		ctx = vfs_context_current();
	}

	if (fmode & O_NOFOLLOW)
		lflags |= VNODE_LOOKUP_NOFOLLOW;

	if (lflags & VNODE_LOOKUP_NOFOLLOW)
		ndflags = NOFOLLOW;
	else
		ndflags = FOLLOW;

	if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
		ndflags |= NOCROSSMOUNT;
	if (lflags & VNODE_LOOKUP_DOWHITEOUT)
		ndflags |= DOWHITEOUT;

	/* XXX AUDITVNPATH1 needed ? */
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	if ((error = vn_open(&nd, fmode, cmode)))
		*vpp = NULL;
	else
		*vpp = nd.ni_vp;

	return (error);
}

/*
 * Close a vnode previously obtained via vnode_open(); drops the
 * iocount the open left on the vnode.
 */
errno_t
vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
{
	int error;

	if (ctx == NULL) {
		ctx = vfs_context_current();
	}

	error = vn_close(vp, flags, ctx);
	vnode_put(vp);
	return (error);
}

/*
 * Returns:	0			Success
 *	vnode_getattr:???
 */
errno_t
vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
{
	/* fetch va_data_size via vnode_getattr and return it in *sizep */
	struct vnode_attr	va;
	int			error;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_data_size);
	error = vnode_getattr(vp, &va, ctx);
	if (!error)
		*sizep = va.va_data_size;
	return(error);
}

/*
 * Set the vnode's data size to 'size'; the low 16 bits of 'ioflag'
 * are passed through in va_vaflags.
 */
errno_t
vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
{
	struct vnode_attr	va;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, size);
	va.va_vaflags = ioflag & 0xffff;
	return(vnode_setattr(vp, &va, ctx));
}

/*
 * Create a filesystem object of arbitrary type with arbitrary attributes in
 * the specified directory with the specified name.
 *
 * Parameters:	dvp			Pointer to the vnode of the directory
 *					in which to create the object.
 *		vpp			Pointer to the area into which to
 *					return the vnode of the created object.
 *		cnp			Component name pointer from the namei
 *					data structure, containing the name to
 *					use for the create object.
 *		vap			Pointer to the vnode_attr structure
 *					describing the object to be created,
 *					including the type of object.
 *		flags			VN_* flags controlling ACL inheritance
 *					and whether or not authorization is to
 *					be required for the operation.
 *
 * Returns:	0			Success
 *		!0			errno value
 *
 * Implicit:	*vpp			Contains the vnode of the object that
 *					was created, if successful.
 *		*cnp			May be modified by the underlying VFS.
 *		*vap			May be modified by the underlying VFS.
 *					modified by either ACL inheritance or
 *
 *
 *					be modified, even if the operation is
 *
 *
 * Notes:	The kauth_filesec_t in 'vap', if any, is in host byte order.
 *
 *		Modification of '*cnp' and '*vap' by the underlying VFS is
 *		strongly discouraged.
 *
 * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
 *
 * XXX: We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
errno_t
vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx)
{
	kauth_acl_t oacl, nacl;
	int initial_acl;
	errno_t	error;
	vnode_t	vp = (vnode_t)0;

	error = 0;
	oacl = nacl = NULL;
	initial_acl = 0;

	KAUTH_DEBUG("%p    CREATE - '%s'", dvp, cnp->cn_nameptr);

	/*
	 * Handle ACL inheritance.
	 */
	if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
		/* save the original filesec */
		if (VATTR_IS_ACTIVE(vap, va_acl)) {
			initial_acl = 1;
			oacl = vap->va_acl;
		}

		vap->va_acl = NULL;
		if ((error = kauth_acl_inherit(dvp,
			 oacl,
			 &nacl,
			 vap->va_type == VDIR,
			 ctx)) != 0) {
			KAUTH_DEBUG("%p    CREATE - error %d processing inheritance", dvp, error);
			return(error);
		}

		/*
		 * If the generated ACL is NULL, then we can save ourselves some effort
		 * by clearing the active bit.
		 */
		if (nacl == NULL) {
			VATTR_CLEAR_ACTIVE(vap, va_acl);
		} else {
			VATTR_SET(vap, va_acl, nacl);
		}
	}

	/*
	 * Check and default new attributes.
	 * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller
	 * hasn't supplied them.
	 */
	if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) {
		KAUTH_DEBUG("%p    CREATE - error %d handing/defaulting attributes", dvp, error);
		goto out;
	}


	/*
	 * Create the requested node.
	 */
	switch(vap->va_type) {
	case VREG:
		error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx);
		break;
	case VDIR:
		error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx);
		break;
	case VSOCK:
	case VFIFO:
	case VBLK:
	case VCHR:
		error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
		break;
	default:
		panic("vnode_create: unknown vtype %d", vap->va_type);
	}
	if (error != 0) {
		KAUTH_DEBUG("%p    CREATE - error %d returned by filesystem", dvp, error);
		goto out;
	}

	vp = *vpp;
#if CONFIG_MACF
	if (!(flags & VN_CREATE_NOLABEL)) {
		error = vnode_label(vnode_mount(vp), dvp, vp, cnp,
		    VNODE_LABEL_CREATE|VNODE_LABEL_NEEDREF, ctx);
		if (error)
			goto error;
	}
#endif

	/*
	 * If some of the requested attributes weren't handled by the VNOP,
	 * use our fallback code.
	 */
	if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
		KAUTH_DEBUG("     CREATE - doing fallback with ACL %p", vap->va_acl);
		error = vnode_setattr_fallback(*vpp, vap, ctx);
	}
#if CONFIG_MACF
error:
#endif
	/* on any failure after creation, release the vnode and clear *vpp */
	if ((error != 0 ) && (vp != (vnode_t)0)) {
		*vpp = (vnode_t) 0;
		vnode_put(vp);
	}

out:
	/*
	 * If the caller supplied a filesec in vap, it has been replaced
	 * now by the post-inheritance copy. We need to put the original back
	 * and free the inherited product.
	 */
	if (initial_acl) {
		VATTR_SET(vap, va_acl, oacl);
	} else {
		VATTR_CLEAR_ACTIVE(vap, va_acl);
	}
	if (nacl != NULL)
		kauth_acl_free(nacl);

	return(error);
}

static kauth_scope_t	vnode_scope;
static int	vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
static int	vnode_authorize_callback_int(__unused kauth_cred_t credential, __unused void *idata, kauth_action_t action,
    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);

/*
 * Per-authorization scratch context: caches owner/group membership
 * results for the target vnode and its directory so they are computed
 * at most once per authorization pass.
 */
typedef struct _vnode_authorize_context {
	vnode_t		vp;
	struct vnode_attr	*vap;
	vnode_t		dvp;
	struct vnode_attr	*dvap;
	vfs_context_t	ctx;
	int		flags;
	int		flags_valid;	/* which cache bits below are populated */
#define	_VAC_IS_OWNER		(1<<0)
#define _VAC_IN_GROUP		(1<<1)
#define	_VAC_IS_DIR_OWNER	(1<<2)
#define	_VAC_IN_DIR_GROUP	(1<<3)
} *vauth_ctx;

/*
 * Register the vnode authorization scope with kauth at startup.
 */
void
vnode_authorize_init(void)
{
	vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
}

/*
 * Authorize an operation on a vnode.
 *
 * This is KPI, but here because it needs vnode_scope.
 *
 * Returns:	0			Success
 *	kauth_authorize_action:EPERM	...
 *	xlate => EACCES			Permission denied
 *	kauth_authorize_action:0	Success
 *	kauth_authorize_action:		Depends on callback return; this is
 *					usually only vnode_authorize_callback(),
 *					but may include other listeners, if any
 *					exist.
 *		EROFS
 *		EACCES
 *		EPERM
 *		???
 */
int
vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
{
	int	error, result;

	/*
	 * We can't authorize against a dead vnode; allow all operations through so that
	 * the correct error can be returned.
	 */
	if (vp->v_type == VBAD)
		return(0);

	error = 0;
	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
		   (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
	if (result == EPERM)		/* traditional behaviour */
		result = EACCES;
	/* did the lower layers give a better error return? */
	if ((result != 0) && (error != 0))
		return(error);
	return(result);
}

/*
 * Test for vnode immutability.
 *
 * The 'append' flag is set when the authorization request is constrained
 * to operations which only request the right to append to a file.
 *
 * The 'ignore' flag is set when an operation modifying the immutability flags
 * is being authorized. We check the system securelevel to determine which
 * immutability flags we can ignore.
 *
 * Returns EPERM if any inhibiting flag remains set after the
 * append/securelevel adjustments, 0 otherwise.
 */
static int
vnode_immutable(struct vnode_attr *vap, int append, int ignore)
{
	int	mask;

	/* start with all bits precluding the operation */
	mask = IMMUTABLE | APPEND;

	/* if appending only, remove the append-only bits */
	if (append)
		mask &= ~APPEND;

	/* ignore only set when authorizing flags changes */
	if (ignore) {
		if (securelevel <= 0) {
			/* in insecure state, flags do not inhibit changes */
			mask = 0;
		} else {
			/* in secure state, user flags don't inhibit */
			mask &= ~(UF_IMMUTABLE | UF_APPEND);
		}
	}
	KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
	if ((vap->va_flags & mask) != 0)
		return(EPERM);
	return(0);
}

/*
 * Return 1 if the credential's uid matches the node's owner uid
 * (as reported by the filesystem), 0 otherwise or when unknown.
 */
static int
vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
{
	int result;

	/* default assumption is not-owner */
	result = 0;

	/*
	 * If the filesystem has given us a UID, we treat this as authoritative.
	 */
	if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
		result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0;
	}
	/* we could test the owner UUID here if we had a policy for it */

	return(result);
}

/*
 * Determine whether the credential is a member of the node's group;
 * result through *ismember (set only on success), errno returned.
 */
static int
vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember)
{
	int	error;
	int	result;

	error = 0;
	result = 0;

	/* the caller is expected to have asked the filesystem for a group at some point */
	if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
		error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
	}
	/* we could test the group UUID here if we had a policy for it */

	if (!error)
		*ismember = result;
	return(error);
}

/*
 * Cached ownership test for the target file (vcp->vap); computes and
 * caches the answer in vcp->flags on first use.
 */
static int
vauth_file_owner(vauth_ctx vcp)
{
	int result;

	if (vcp->flags_valid & _VAC_IS_OWNER) {
		result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0;
	} else {
		result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);

		/* cache our result */
		vcp->flags_valid |= _VAC_IS_OWNER;
		if (result) {
			vcp->flags |= _VAC_IS_OWNER;
		} else {
			vcp->flags &= ~_VAC_IS_OWNER;
		}
	}
	return(result);
}

/*
 * Cached group-membership test for the target file (vcp->vap);
 * the cache is only populated when the underlying lookup succeeds.
 */
static int
vauth_file_ingroup(vauth_ctx vcp, int *ismember)
{
	int	error;

	if (vcp->flags_valid & _VAC_IN_GROUP) {
		*ismember = (vcp->flags & _VAC_IN_GROUP) ? 1 : 0;
		error = 0;
	} else {
		error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember);

		if (!error) {
			/* cache our result */
			vcp->flags_valid |= _VAC_IN_GROUP;
			if (*ismember) {
				vcp->flags |= _VAC_IN_GROUP;
			} else {
				vcp->flags &= ~_VAC_IN_GROUP;
			}
		}

	}
	return(error);
}

/*
 * Cached ownership test for the containing directory (vcp->dvap).
 */
static int
vauth_dir_owner(vauth_ctx vcp)
{
	int result;

	if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
		result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0;
	} else {
		result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);

		/* cache our result */
		vcp->flags_valid |= _VAC_IS_DIR_OWNER;
		if (result) {
			vcp->flags |= _VAC_IS_DIR_OWNER;
		} else {
			vcp->flags &= ~_VAC_IS_DIR_OWNER;
		}
	}
	return(result);
}

/*
 * Cached group-membership test for the containing directory (vcp->dvap).
 */
static int
vauth_dir_ingroup(vauth_ctx vcp, int *ismember)
{
	int	error;

	if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
		*ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 1 : 0;
		error = 0;
	} else {
		error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember);

		if (!error) {
			/* cache our result */
			vcp->flags_valid |= _VAC_IN_DIR_GROUP;
			if (*ismember) {
				vcp->flags |= _VAC_IN_DIR_GROUP;
			} else {
				vcp->flags &= ~_VAC_IN_DIR_GROUP;
			}
		}
	}
	return(error);
}

/*
 * Test the posix permissions in (vap) to determine whether (credential)
 * may perform (action)
 */
static int
vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
{
	struct vnode_attr *vap;
	int needed, error, owner_ok, group_ok, world_ok, ismember;
#ifdef KAUTH_DEBUG_ENABLE
	const char *where = "uninitialized";
# define _SETWHERE(c)	where = c;
#else
# define _SETWHERE(c)
#endif

	/* checking file or directory? */
	if (on_dir) {
		vap = vcp->dvap;
	} else {
		vap = vcp->vap;
	}

	error = 0;

	/*
	 * We want to do as little work here as possible. So first we check
	 * which sets of permissions grant us the access we need, and avoid checking
	 * whether specific permissions grant access when more generic ones would.
4874 */ 4875 4876 /* owner permissions */ 4877 needed = 0; 4878 if (action & VREAD) 4879 needed |= S_IRUSR; 4880 if (action & VWRITE) 4881 needed |= S_IWUSR; 4882 if (action & VEXEC) 4883 needed |= S_IXUSR; 4884 owner_ok = (needed & vap->va_mode) == needed; 4885 4886 /* group permissions */ 4887 needed = 0; 4888 if (action & VREAD) 4889 needed |= S_IRGRP; 4890 if (action & VWRITE) 4891 needed |= S_IWGRP; 4892 if (action & VEXEC) 4893 needed |= S_IXGRP; 4894 group_ok = (needed & vap->va_mode) == needed; 4895 4896 /* world permissions */ 4897 needed = 0; 4898 if (action & VREAD) 4899 needed |= S_IROTH; 4900 if (action & VWRITE) 4901 needed |= S_IWOTH; 4902 if (action & VEXEC) 4903 needed |= S_IXOTH; 4904 world_ok = (needed & vap->va_mode) == needed; 4905 4906 /* If granted/denied by all three, we're done */ 4907 if (owner_ok && group_ok && world_ok) { 4908 _SETWHERE("all"); 4909 goto out; 4910 } 4911 if (!owner_ok && !group_ok && !world_ok) { 4912 _SETWHERE("all"); 4913 error = EACCES; 4914 goto out; 4915 } 4916 4917 /* Check ownership (relatively cheap) */ 4918 if ((on_dir && vauth_dir_owner(vcp)) || 4919 (!on_dir && vauth_file_owner(vcp))) { 4920 _SETWHERE("user"); 4921 if (!owner_ok) 4922 error = EACCES; 4923 goto out; 4924 } 4925 4926 /* Not owner; if group and world both grant it we're done */ 4927 if (group_ok && world_ok) { 4928 _SETWHERE("group/world"); 4929 goto out; 4930 } 4931 if (!group_ok && !world_ok) { 4932 _SETWHERE("group/world"); 4933 error = EACCES; 4934 goto out; 4935 } 4936 4937 /* Check group membership (most expensive) */ 4938 ismember = 0; 4939 if (on_dir) { 4940 error = vauth_dir_ingroup(vcp, &ismember); 4941 } else { 4942 error = vauth_file_ingroup(vcp, &ismember); 4943 } 4944 if (error) 4945 goto out; 4946 if (ismember) { 4947 _SETWHERE("group"); 4948 if (!group_ok) 4949 error = EACCES; 4950 goto out; 4951 } 4952 4953 /* Not owner, not in group, use world result */ 4954 _SETWHERE("world"); 4955 if (!world_ok) 4956 error = EACCES; 4957 4958 
/* FALLTHROUGH */ 4959 4960out: 4961 KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d", 4962 vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where, 4963 (action & VREAD) ? "r" : "-", 4964 (action & VWRITE) ? "w" : "-", 4965 (action & VEXEC) ? "x" : "-", 4966 needed, 4967 (vap->va_mode & S_IRUSR) ? "r" : "-", 4968 (vap->va_mode & S_IWUSR) ? "w" : "-", 4969 (vap->va_mode & S_IXUSR) ? "x" : "-", 4970 (vap->va_mode & S_IRGRP) ? "r" : "-", 4971 (vap->va_mode & S_IWGRP) ? "w" : "-", 4972 (vap->va_mode & S_IXGRP) ? "x" : "-", 4973 (vap->va_mode & S_IROTH) ? "r" : "-", 4974 (vap->va_mode & S_IWOTH) ? "w" : "-", 4975 (vap->va_mode & S_IXOTH) ? "x" : "-", 4976 kauth_cred_getuid(vcp->ctx->vc_ucred), 4977 on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid, 4978 on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid); 4979 return(error); 4980} 4981 4982/* 4983 * Authorize the deletion of the node vp from the directory dvp. 4984 * 4985 * We assume that: 4986 * - Neither the node nor the directory are immutable. 4987 * - The user is not the superuser. 4988 * 4989 * Deletion is not permitted if the directory is sticky and the caller is 4990 * not owner of the node or directory. 4991 * 4992 * If either the node grants DELETE, or the directory grants DELETE_CHILD, 4993 * the node may be deleted. If neither denies the permission, and the 4994 * caller has Posix write access to the directory, then the node may be 4995 * deleted. 4996 * 4997 * As an optimization, we cache whether or not delete child is permitted 4998 * on directories without the sticky bit set. 
4999 */ 5000int 5001vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child); 5002/*static*/ int 5003vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) 5004{ 5005 struct vnode_attr *vap = vcp->vap; 5006 struct vnode_attr *dvap = vcp->dvap; 5007 kauth_cred_t cred = vcp->ctx->vc_ucred; 5008 struct kauth_acl_eval eval; 5009 int error, delete_denied, delete_child_denied, ismember; 5010 5011 /* check the ACL on the directory */ 5012 delete_child_denied = 0; 5013 if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { 5014 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; 5015 eval.ae_acl = &dvap->va_acl->acl_ace[0]; 5016 eval.ae_count = dvap->va_acl->acl_entrycount; 5017 eval.ae_options = 0; 5018 if (vauth_dir_owner(vcp)) 5019 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 5020 if ((error = vauth_dir_ingroup(vcp, &ismember)) != 0) 5021 return(error); 5022 if (ismember) 5023 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 5024 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 5025 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 5026 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 5027 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 5028 5029 error = kauth_acl_evaluate(cred, &eval); 5030 5031 if (error != 0) { 5032 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 5033 return(error); 5034 } 5035 if (eval.ae_result == KAUTH_RESULT_DENY) 5036 delete_child_denied = 1; 5037 if (eval.ae_result == KAUTH_RESULT_ALLOW) { 5038 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); 5039 return(0); 5040 } 5041 } 5042 5043 /* check the ACL on the node */ 5044 delete_denied = 0; 5045 if (VATTR_IS_NOT(vap, va_acl, NULL)) { 5046 eval.ae_requested = KAUTH_VNODE_DELETE; 5047 eval.ae_acl = &vap->va_acl->acl_ace[0]; 5048 eval.ae_count = vap->va_acl->acl_entrycount; 5049 eval.ae_options = 0; 5050 if (vauth_file_owner(vcp)) 5051 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 5052 if ((error = vauth_file_ingroup(vcp, &ismember)) != 0) 5053 
return(error); 5054 if (ismember) 5055 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 5056 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 5057 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 5058 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 5059 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 5060 5061 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { 5062 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 5063 return(error); 5064 } 5065 if (eval.ae_result == KAUTH_RESULT_DENY) 5066 delete_denied = 1; 5067 if (eval.ae_result == KAUTH_RESULT_ALLOW) { 5068 KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp); 5069 return(0); 5070 } 5071 } 5072 5073 /* if denied by ACL on directory or node, return denial */ 5074 if (delete_denied || delete_child_denied) { 5075 KAUTH_DEBUG("%p ALLOWED - denied by ACL", vcp->vp); 5076 return(EACCES); 5077 } 5078 5079 /* 5080 * enforce sticky bit behaviour; the cached_delete_child property will 5081 * be false and the dvap contents valis for sticky bit directories; 5082 * this makes us check the directory each time, but it's unavoidable, 5083 * as sticky bit is an exception to caching. 5084 */ 5085 if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { 5086 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", 5087 vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid); 5088 return(EACCES); 5089 } 5090 5091 /* check the directory */ 5092 if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { 5093 KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp); 5094 return(error); 5095 } 5096 5097 /* not denied, must be OK */ 5098 return(0); 5099} 5100 5101 5102/* 5103 * Authorize an operation based on the node's attributes. 
 */
static int
vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
{
	struct vnode_attr	*vap = vcp->vap;
	kauth_cred_t		cred = vcp->ctx->vc_ucred;
	struct kauth_acl_eval	eval;
	int			error, ismember;
	mode_t			posix_action;

	/*
	 * If we are the file owner, we automatically have some rights.
	 *
	 * Do we need to expand this to support group ownership?
	 */
	if (vauth_file_owner(vcp))
		acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);

	/*
	 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
	 * mask the latter.  If TAKE_OWNERSHIP is requested the caller is about to
	 * change ownership to themselves, and WRITE_SECURITY is implicitly
	 * granted to the owner.  We need to do this because at this point
	 * WRITE_SECURITY may not be granted as the caller is not currently
	 * the owner.
	 */
	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
	    (acl_rights & KAUTH_VNODE_WRITE_SECURITY))
		acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;

	if (acl_rights == 0) {
		KAUTH_DEBUG("%p    ALLOWED - implicit or no rights required", vcp->vp);
		return(0);
	}

	/* if we have an ACL, evaluate it */
	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
		eval.ae_requested = acl_rights;
		eval.ae_acl = &vap->va_acl->acl_ace[0];
		eval.ae_count = vap->va_acl->acl_entrycount;
		eval.ae_options = 0;
		if (vauth_file_owner(vcp))
			eval.ae_options |= KAUTH_AEVAL_IS_OWNER;
		if ((error = vauth_file_ingroup(vcp, &ismember)) != 0)
			return(error);
		if (ismember)
			eval.ae_options |= KAUTH_AEVAL_IN_GROUP;
		eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
		eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
		eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
		eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;

		if ((error = kauth_acl_evaluate(cred, &eval)) != 0) {
			KAUTH_DEBUG("%p    ERROR during ACL processing - %d", vcp->vp, error);
			return(error);
		}

		if (eval.ae_result == KAUTH_RESULT_DENY) {
			KAUTH_DEBUG("%p    DENIED - by ACL", vcp->vp);
			return(EACCES);			/* deny, deny, counter-allege */
		}
		if (eval.ae_result == KAUTH_RESULT_ALLOW) {
			KAUTH_DEBUG("%p    ALLOWED - all rights granted by ACL", vcp->vp);
			return(0);
		}
		*found_deny = eval.ae_found_deny;

		/* fall through and evaluate residual rights */
	} else {
		/* no ACL, everything is residual */
		eval.ae_residual = acl_rights;
	}

	/*
	 * Grant residual rights that have been pre-authorized.
	 */
	eval.ae_residual &= ~preauth_rights;

	/*
	 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
	 */
	if (vauth_file_owner(vcp))
		eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;

	if (eval.ae_residual == 0) {
		KAUTH_DEBUG("%p    ALLOWED - rights already authorized", vcp->vp);
		return(0);
	}

	/*
	 * Bail if we have residual rights that can't be granted by posix permissions,
	 * or aren't presumed granted at this point.
	 *
	 * XXX these can be collapsed for performance
	 */
	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
		KAUTH_DEBUG("%p    DENIED - CHANGE_OWNER not permitted", vcp->vp);
		return(EACCES);
	}
	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
		KAUTH_DEBUG("%p    DENIED - WRITE_SECURITY not permitted", vcp->vp);
		return(EACCES);
	}

#if DIAGNOSTIC
	if (eval.ae_residual & KAUTH_VNODE_DELETE)
		panic("vnode_authorize: can't be checking delete permission here");
#endif

	/*
	 * Compute the fallback posix permissions that will satisfy the remaining
	 * rights.
	 */
	posix_action = 0;
	if (eval.ae_residual & (KAUTH_VNODE_READ_DATA |
		KAUTH_VNODE_LIST_DIRECTORY |
		KAUTH_VNODE_READ_EXTATTRIBUTES))
		posix_action |= VREAD;
	if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA |
		KAUTH_VNODE_ADD_FILE |
		KAUTH_VNODE_ADD_SUBDIRECTORY |
		KAUTH_VNODE_DELETE_CHILD |
		KAUTH_VNODE_WRITE_ATTRIBUTES |
		KAUTH_VNODE_WRITE_EXTATTRIBUTES))
		posix_action |= VWRITE;
	if (eval.ae_residual & (KAUTH_VNODE_EXECUTE |
		KAUTH_VNODE_SEARCH))
		posix_action |= VEXEC;

	if (posix_action != 0) {
		return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */));
	} else {
		KAUTH_DEBUG("%p    ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
		    vcp->vp,
		    (eval.ae_residual & KAUTH_VNODE_READ_DATA)
		    ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
		    ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_EXECUTE)
		    ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
		    (eval.ae_residual & KAUTH_VNODE_DELETE)
		    ? " DELETE" : "",
		    (eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
		    ? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
		    (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
		    ? " DELETE_CHILD" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
		    ? " READ_ATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
		    ? " WRITE_ATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
		    ? " READ_EXTATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
		    ? " WRITE_EXTATTRIBUTES" : "",
		    (eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
		    ? " READ_SECURITY" : "",
		    (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
		    ? " WRITE_SECURITY" : "",
		    (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
		    ? " CHECKIMMUTABLE" : "",
		    (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
		    ? " CHANGE_OWNER" : "");
	}

	/*
	 * Lack of required Posix permissions implies no reason to deny access.
	 */
	return(0);
}

/*
 * Check for file immutability.
 *
 * Returns 0 if the requested rights are compatible with the node's
 * immutability flags, or an error (EACCES/EPERM) otherwise.
 */
static int
vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore)
{
	mount_t mp;
	int error;
	int append;

	/*
	 * Perform immutability checks for operations that change data.
	 *
	 * Sockets, fifos and devices require special handling.
	 */
	switch(vp->v_type) {
	case VSOCK:
	case VFIFO:
	case VBLK:
	case VCHR:
		/*
		 * Writing to these nodes does not change the filesystem data,
		 * so forget that it's being tried.
		 */
		rights &= ~KAUTH_VNODE_WRITE_DATA;
		break;
	default:
		break;
	}

	error = 0;
	if (rights & KAUTH_VNODE_WRITE_RIGHTS) {

		/* check per-filesystem options if possible */
		mp = vp->v_mount;
		if (mp != NULL) {

			/* check for no-EA filesystems */
			if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
			    (vfs_flags(mp) & MNT_NOUSERXATTR)) {
				KAUTH_DEBUG("%p    DENIED - filesystem disallowed extended attributes", vp);
				error = EACCES;  /* User attributes disabled */
				goto out;
			}
		}

		/*
		 * check for file immutability; append-only requests get the
		 * relaxed APPEND treatment only when *all* requested rights
		 * are append-type rights
		 */
		append = 0;
		if (vp->v_type == VDIR) {
			if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) == rights)
				append = 1;
		} else {
			if ((rights & KAUTH_VNODE_APPEND_DATA) == rights)
				append = 1;
		}
		if ((error = vnode_immutable(vap, append, ignore)) != 0) {
			KAUTH_DEBUG("%p    DENIED - file is immutable", vp);
			goto out;
		}
	}
out:
	return(error);
}

/*
 * Handle authorization actions for filesystems that advertise that the
 * server will be enforcing.
 *
 * Returns:	0	Authorization should be handled locally
 *		1	Authorization was handled by the FS
 *
 * Note:	Imputed returns will only occur if the authorization request
 *		was handled by the FS.
 *
 * Imputed:	*resultp, modified	Return code from FS when the request is
 *					handled by the FS.
 *		VNOP_ACCESS:???
 *		VNOP_OPEN:???
 */
static int
vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
{
	int	error;

	/*
	 * If the vp is a device node, socket or FIFO it actually represents a local
	 * endpoint, so we need to handle it locally.
	 */
	switch(vp->v_type) {
	case VBLK:
	case VCHR:
	case VSOCK:
	case VFIFO:
		return(0);
	default:
		break;
	}

	/*
	 * In the advisory request case, if the filesystem doesn't think it's reliable
	 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
	 */
	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount))
		return(0);

	/*
	 * Let the filesystem have a say in the matter.  It's OK for it to not implement
	 * VNOP_ACCESS, as most will authorise inline with the actual request.
	 */
	if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
		*resultp = error;
		KAUTH_DEBUG("%p    DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
		return(1);
	}

	/*
	 * Typically opaque filesystems do authorisation in-line, but exec is a special case.  In
	 * order to be reasonably sure that exec will be permitted, we try a bit harder here.
	 */
	if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
		/* try a VNOP_OPEN for readonly access */
		if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) {
			*resultp = error;
			KAUTH_DEBUG("%p    DENIED - EXECUTE denied because file could not be opened readonly", vp);
			return(1);
		}
		VNOP_CLOSE(vp, FREAD, ctx);
	}

	/*
	 * We don't have any reason to believe that the request has to be denied at this point,
	 * so go ahead and allow it.
	 */
	*resultp = 0;
	KAUTH_DEBUG("%p    ALLOWED - bypassing access check for non-local filesystem", vp);
	return(1);
}




/*
 * Returns:	KAUTH_RESULT_ALLOW
 *		KAUTH_RESULT_DENY
 *
 * Imputed:	*arg3, modified		Error code in the deny case
 *		EROFS			Read-only file system
 *		EACCES			Permission denied
 *		EPERM			Operation not permitted [no execute]
 *	vnode_getattr:ENOMEM		Not enough space [only if has filesec]
 *	vnode_getattr:???
 *	vnode_authorize_opaque:*arg2	???
 *	vnode_authorize_checkimmutable:???
 *	vnode_authorize_delete:???
 *	vnode_authorize_simple:???
 */


static int
vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action,
			 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	vfs_context_t	ctx;
	vnode_t		cvp = NULLVP;
	vnode_t		vp, dvp;
	int		result;

	ctx = (vfs_context_t)arg0;
	vp = (vnode_t)arg1;
	dvp = (vnode_t)arg2;

	/*
	 * if there are 2 vnodes passed in, we don't know at
	 * this point which rights to look at based on the
	 * combined action being passed in... defer until later...
	 * otherwise check the kauth 'rights' cache hung
	 * off of the vnode we're interested in... if we've already
	 * been granted the right we're currently interested in,
	 * we can just return success... otherwise we'll go through
	 * the process of authorizing the requested right(s)... if that
	 * succeeds, we'll add the right(s) to the cache.
	 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
	 */
	if (dvp && vp)
		goto defer;
	if (dvp)
		cvp = dvp;
	else
		cvp = vp;

	if (vnode_cache_is_authorized(cvp, ctx, action) == TRUE)
		return KAUTH_RESULT_ALLOW;
defer:
	result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3);

	if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP)
		vnode_cache_authorized_action(cvp, ctx, action);

	return result;
}


static int
vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action,
			     uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct _vnode_authorize_context auth_context;
	vauth_ctx		vcp;
	vfs_context_t		ctx;
	vnode_t			vp, dvp;
	kauth_cred_t		cred;
	kauth_ace_rights_t	rights;
	struct vnode_attr	va, dva;
	int			result;
	int			*errorp;
	int			noimmutable;
	boolean_t		parent_authorized_for_delete_child = FALSE;
	boolean_t		found_deny = FALSE;
	boolean_t		parent_ref= FALSE;

	vcp = &auth_context;
	ctx = vcp->ctx = (vfs_context_t)arg0;
	vp = vcp->vp = (vnode_t)arg1;
	dvp = vcp->dvp = (vnode_t)arg2;
	errorp = (int *)arg3;
	/*
	 * Note that we authorize against the context, not the passed cred
	 * (the same thing anyway)
	 */
	cred = ctx->vc_ucred;

	VATTR_INIT(&va);
	vcp->vap = &va;
	VATTR_INIT(&dva);
	vcp->dvap = &dva;

	vcp->flags = vcp->flags_valid = 0;

#if DIAGNOSTIC
	if ((ctx == NULL) || (vp == NULL) || (cred == NULL))
		panic("vnode_authorize: bad arguments (context %p  vp %p  cred %p)", ctx, vp, cred);
#endif

	KAUTH_DEBUG("%p    AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
	    vp, vfs_context_proc(ctx)->p_comm,
	    (action & KAUTH_VNODE_ACCESS)		? "access" : "auth",
	    (action & KAUTH_VNODE_READ_DATA)		? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
	    (action & KAUTH_VNODE_WRITE_DATA)		? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
	    (action & KAUTH_VNODE_EXECUTE)		? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
	    (action & KAUTH_VNODE_DELETE)		? " DELETE" : "",
	    (action & KAUTH_VNODE_APPEND_DATA)		? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
	    (action & KAUTH_VNODE_DELETE_CHILD)		? " DELETE_CHILD" : "",
	    (action & KAUTH_VNODE_READ_ATTRIBUTES)	? " READ_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_ATTRIBUTES)	? " WRITE_ATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_EXTATTRIBUTES)	? " READ_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES)	? " WRITE_EXTATTRIBUTES" : "",
	    (action & KAUTH_VNODE_READ_SECURITY)	? " READ_SECURITY" : "",
	    (action & KAUTH_VNODE_WRITE_SECURITY)	? " WRITE_SECURITY" : "",
	    (action & KAUTH_VNODE_CHANGE_OWNER)		? " CHANGE_OWNER" : "",
	    (action & KAUTH_VNODE_NOIMMUTABLE)		? " (noimmutable)" : "",
	    vnode_isdir(vp) ? "directory" : "file",
	    vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);

	/*
	 * Extract the control bits from the action, everything else is
	 * requested rights.
	 */
	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0;
	rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE);

	if (rights & KAUTH_VNODE_DELETE) {
#if DIAGNOSTIC
		if (dvp == NULL)
			panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
#endif
		/*
		 * check to see if we've already authorized the parent
		 * directory for deletion of its children... if so, we
		 * can skip a whole bunch of work... we will still have to
		 * authorize that this specific child can be removed
		 */
		if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE)
			parent_authorized_for_delete_child = TRUE;
	} else {
		dvp = NULL;
	}

	/*
	 * Check for read-only filesystems.
	 */
	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY) &&
	    ((vp->v_type == VREG) || (vp->v_type == VDIR) ||
	     (vp->v_type == VLNK) || (vp->v_type == VCPLX) ||
	     (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) {
		result = EROFS;
		goto out;
	}

	/*
	 * Check for noexec filesystems.
	 */
	if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
		result = EACCES;
		goto out;
	}

	/*
	 * Handle cases related to filesystems with non-local enforcement.
	 * This call can return 0, in which case we will fall through to perform a
	 * check based on VNOP_GETATTR data.  Otherwise it returns 1 and sets
	 * an appropriate result, at which point we can return immediately.
	 */
	if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx))
		goto out;

	/*
	 * Get vnode attributes and extended security information for the vnode
	 * and directory if required.
	 */
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);
	if ((result = vnode_getattr(vp, &va, ctx)) != 0) {
		KAUTH_DEBUG("%p    ERROR - failed to get vnode attributes - %d", vp, result);
		goto out;
	}
	if (dvp && parent_authorized_for_delete_child == FALSE) {
		VATTR_WANTED(&dva, va_mode);
		VATTR_WANTED(&dva, va_uid);
		VATTR_WANTED(&dva, va_gid);
		VATTR_WANTED(&dva, va_flags);
		VATTR_WANTED(&dva, va_acl);
		if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) {
			KAUTH_DEBUG("%p    ERROR - failed to get directory vnode attributes - %d", vp, result);
			goto out;
		}
	}

	/*
	 * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes
	 * *_EXTATTRIBUTES.
	 */
	if (S_ISXATTR(va.va_mode) || vnode_isnamedstream(vp)) {
		if (rights & KAUTH_VNODE_READ_DATA) {
			rights &= ~KAUTH_VNODE_READ_DATA;
			rights |= KAUTH_VNODE_READ_EXTATTRIBUTES;
		}
		if (rights & KAUTH_VNODE_WRITE_DATA) {
			rights &= ~KAUTH_VNODE_WRITE_DATA;
			rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
		}
	}

	/*
	 * Point 'vp' to the resource fork's parent for ACL checking
	 */
	if (vnode_isnamedstream(vp) &&
	    (vp->v_parent != NULL) &&
	    (vget_internal(vp->v_parent, 0, VNODE_NODEAD) == 0)) {
		parent_ref = TRUE;
		vcp->vp = vp = vp->v_parent;
		/* drop the ACL fetched for the stream vnode before re-fetching for the parent */
		if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
			kauth_acl_free(va.va_acl);
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_mode);
		VATTR_WANTED(&va, va_uid);
		VATTR_WANTED(&va, va_gid);
		VATTR_WANTED(&va, va_flags);
		VATTR_WANTED(&va, va_acl);
		if ((result = vnode_getattr(vp, &va, ctx)) != 0)
			goto out;
	}

	/*
	 * Check for immutability.
	 *
	 * In the deletion case, parent directory immutability vetoes specific
	 * file rights.
	 */
	if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0)
		goto out;
	if ((rights & KAUTH_VNODE_DELETE) &&
	    parent_authorized_for_delete_child == FALSE &&
	    ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0))
		goto out;

	/*
	 * Clear rights that have been authorized by reaching this point, bail if nothing left to
	 * check.
	 */
	rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE);
	if (rights == 0)
		goto out;

	/*
	 * If we're not the superuser, authorize based on file properties;
	 * note that even if parent_authorized_for_delete_child is TRUE, we
	 * need to check on the node itself.
	 */
	if (!vfs_context_issuser(ctx)) {
		/* process delete rights */
		if ((rights & KAUTH_VNODE_DELETE) &&
		    ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0))
		    goto out;

		/* process remaining rights */
		if ((rights & ~KAUTH_VNODE_DELETE) &&
		    (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, &found_deny)) != 0)
			goto out;
	} else {

		/*
		 * Execute is only granted to root if one of the x bits is set.  This check only
		 * makes sense if the posix mode bits are actually supported.
		 */
		if ((rights & KAUTH_VNODE_EXECUTE) &&
		    (vp->v_type == VREG) &&
		    VATTR_IS_SUPPORTED(&va, va_mode) &&
		    !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
			result = EPERM;
			KAUTH_DEBUG("%p    DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
			goto out;
		}

		KAUTH_DEBUG("%p    ALLOWED - caller is superuser", vp);
	}
out:
	/* free any ACLs fetched by the vnode_getattr calls above */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
		kauth_acl_free(va.va_acl);
	if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL))
		kauth_acl_free(dva.va_acl);

	if (result) {
		if (parent_ref)
			vnode_put(vp);
		*errorp = result;
		KAUTH_DEBUG("%p    DENIED - auth denied", vp);
		return(KAUTH_RESULT_DENY);
	}
	if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
		/*
		 * if we were successfully granted the right to search this directory
		 * and there were NO ACL DENYs for search and the posix permissions also don't
		 * deny execute, we can synthesize a global right that allows anyone to
		 * traverse this directory during a pathname lookup without having to
		 * match the credential associated with this cache of rights.
		 */
		if (!VATTR_IS_SUPPORTED(&va, va_mode) ||
		    ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) ==
		     (S_IXUSR | S_IXGRP | S_IXOTH))) {
			vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
		}
	}
	if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) {
		/*
		 * parent was successfully and newly authorized for content deletions
		 * add it to the cache, but only if it doesn't have the sticky
		 * bit set on it.  This same  check is done earlier guarding
		 * fetching of dva, and if we jumped to out without having done
		 * this, we will have returned already because of a non-zero
		 * 'result' value.
		 */
		if (VATTR_IS_SUPPORTED(&dva, va_mode) &&
		    !(dva.va_mode & (S_ISVTX))) {
			/* OK to cache delete rights */
			vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD);
		}
	}
	if (parent_ref)
		vnode_put(vp);
	/*
	 * Note that this implies that we will allow requests for no rights, as well as
	 * for rights that we do not recognise.  There should be none of these.
	 */
	KAUTH_DEBUG("%p    ALLOWED - auth granted", vp);
	return(KAUTH_RESULT_ALLOW);
}

/*
 * Check that the attribute information in vattr can be legally applied to
 * a new file by the context.
 */
int
vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx)
{
	int		error;
	int		has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
	kauth_cred_t	cred;
	guid_t		changer;
	mount_t		dmp;

	error = 0;
	defaulted_owner = defaulted_group = defaulted_mode = 0;

	/*
	 * Require that the filesystem support extended security to apply any.
	 */
	if (!vfs_extendedsecurity(dvp->v_mount) &&
	    (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Default some fields.
	 */
	dmp = dvp->v_mount;

	/*
	 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
	 * owner takes ownership of all new files.
5788 */ 5789 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) { 5790 VATTR_SET(vap, va_uid, dmp->mnt_fsowner); 5791 defaulted_owner = 1; 5792 } else { 5793 if (!VATTR_IS_ACTIVE(vap, va_uid)) { 5794 /* default owner is current user */ 5795 VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx))); 5796 defaulted_owner = 1; 5797 } 5798 } 5799 5800 /* 5801 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that 5802 * group takes ownership of all new files. 5803 */ 5804 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) { 5805 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup); 5806 defaulted_group = 1; 5807 } else { 5808 if (!VATTR_IS_ACTIVE(vap, va_gid)) { 5809 /* default group comes from parent object, fallback to current user */ 5810 struct vnode_attr dva; 5811 VATTR_INIT(&dva); 5812 VATTR_WANTED(&dva, va_gid); 5813 if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) 5814 goto out; 5815 if (VATTR_IS_SUPPORTED(&dva, va_gid)) { 5816 VATTR_SET(vap, va_gid, dva.va_gid); 5817 } else { 5818 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx))); 5819 } 5820 defaulted_group = 1; 5821 } 5822 } 5823 5824 if (!VATTR_IS_ACTIVE(vap, va_flags)) 5825 VATTR_SET(vap, va_flags, 0); 5826 5827 /* default mode is everything, masked with current umask */ 5828 if (!VATTR_IS_ACTIVE(vap, va_mode)) { 5829 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask); 5830 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask); 5831 defaulted_mode = 1; 5832 } 5833 /* set timestamps to now */ 5834 if (!VATTR_IS_ACTIVE(vap, va_create_time)) { 5835 nanotime(&vap->va_create_time); 5836 VATTR_SET_ACTIVE(vap, va_create_time); 5837 } 5838 5839 /* 5840 * Check for attempts to set nonsensical fields. 
5841 */ 5842 if (vap->va_active & ~VNODE_ATTR_NEWOBJ) { 5843 error = EINVAL; 5844 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx", 5845 vap->va_active & ~VNODE_ATTR_NEWOBJ); 5846 goto out; 5847 } 5848 5849 /* 5850 * Quickly check for the applicability of any enforcement here. 5851 * Tests below maintain the integrity of the local security model. 5852 */ 5853 if (vfs_authopaque(dvp->v_mount)) 5854 goto out; 5855 5856 /* 5857 * We need to know if the caller is the superuser, or if the work is 5858 * otherwise already authorised. 5859 */ 5860 cred = vfs_context_ucred(ctx); 5861 if (noauth) { 5862 /* doing work for the kernel */ 5863 has_priv_suser = 1; 5864 } else { 5865 has_priv_suser = vfs_context_issuser(ctx); 5866 } 5867 5868 5869 if (VATTR_IS_ACTIVE(vap, va_flags)) { 5870 if (has_priv_suser) { 5871 if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { 5872 error = EPERM; 5873 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)"); 5874 goto out; 5875 } 5876 } else { 5877 if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) { 5878 error = EPERM; 5879 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)"); 5880 goto out; 5881 } 5882 } 5883 } 5884 5885 /* if not superuser, validate legality of new-item attributes */ 5886 if (!has_priv_suser) { 5887 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) { 5888 /* setgid? */ 5889 if (vap->va_mode & S_ISGID) { 5890 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 5891 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); 5892 goto out; 5893 } 5894 if (!ismember) { 5895 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid); 5896 error = EPERM; 5897 goto out; 5898 } 5899 } 5900 5901 /* setuid? 
*/ 5902 if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) { 5903 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); 5904 error = EPERM; 5905 goto out; 5906 } 5907 } 5908 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) { 5909 KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid); 5910 error = EPERM; 5911 goto out; 5912 } 5913 if (!defaulted_group) { 5914 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 5915 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid); 5916 goto out; 5917 } 5918 if (!ismember) { 5919 KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid); 5920 error = EPERM; 5921 goto out; 5922 } 5923 } 5924 5925 /* initialising owner/group UUID */ 5926 if (VATTR_IS_ACTIVE(vap, va_uuuid)) { 5927 if ((error = kauth_cred_getguid(cred, &changer)) != 0) { 5928 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error); 5929 /* XXX ENOENT here - no GUID - should perhaps become EPERM */ 5930 goto out; 5931 } 5932 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) { 5933 KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us"); 5934 error = EPERM; 5935 goto out; 5936 } 5937 } 5938 if (VATTR_IS_ACTIVE(vap, va_guuid)) { 5939 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) { 5940 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error); 5941 goto out; 5942 } 5943 if (!ismember) { 5944 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member"); 5945 error = EPERM; 5946 goto out; 5947 } 5948 } 5949 } 5950out: 5951 return(error); 5952} 5953 5954/* 5955 * Check that the attribute information in vap can be legally written by the 5956 * context. 5957 * 5958 * Call this when you're not sure about the vnode_attr; either its contents 5959 * have come from an unknown source, or when they are variable. 
 *
 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that
 * must be authorized to be permitted to write the vattr.
 *
 * Parameters:	vp	vnode whose attributes are to be written
 *		vap	proposed new attribute values; may be modified here
 *			(setuid/setgid bits are masked on ownership change)
 *		actionp	out: required KAUTH_VNODE_* action bits on success
 *		ctx	context of the requesting thread
 */
int
vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx)
{
	struct vnode_attr ova;		/* existing ("old") attributes fetched from vp */
	kauth_action_t	required_action;
	int		error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
	guid_t		changer;
	gid_t		group;
	uid_t		owner;
	mode_t		newmode;
	kauth_cred_t	cred;
	uint32_t	fdelta;

	VATTR_INIT(&ova);
	required_action = 0;
	error = 0;

	/*
	 * Quickly check for enforcement applicability.
	 */
	if (vfs_authopaque(vp->v_mount))
		goto out;

	/*
	 * Check for attempts to set nonsensical fields.
	 */
	if (vap->va_active & VNODE_ATTR_RDONLY) {
		KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
		error = EINVAL;
		goto out;
	}

	/*
	 * We need to know if the caller is the superuser.
	 */
	cred = vfs_context_ucred(ctx);
	has_priv_suser = kauth_cred_issuser(cred);

	/*
	 * If any of the following are changing, we need information from the old file:
	 * va_uid
	 * va_gid
	 * va_mode
	 * va_uuuid
	 * va_guuid
	 */
	if (VATTR_IS_ACTIVE(vap, va_uid) ||
	    VATTR_IS_ACTIVE(vap, va_gid) ||
	    VATTR_IS_ACTIVE(vap, va_mode) ||
	    VATTR_IS_ACTIVE(vap, va_uuuid) ||
	    VATTR_IS_ACTIVE(vap, va_guuid)) {
		VATTR_WANTED(&ova, va_mode);
		VATTR_WANTED(&ova, va_uid);
		VATTR_WANTED(&ova, va_gid);
		VATTR_WANTED(&ova, va_uuuid);
		VATTR_WANTED(&ova, va_guuid);
		KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
	}

	/*
	 * If timestamps are being changed, we need to know who the file is owned
	 * by.
	 */
	if (VATTR_IS_ACTIVE(vap, va_create_time) ||
	    VATTR_IS_ACTIVE(vap, va_change_time) ||
	    VATTR_IS_ACTIVE(vap, va_modify_time) ||
	    VATTR_IS_ACTIVE(vap, va_access_time) ||
	    VATTR_IS_ACTIVE(vap, va_backup_time)) {

		VATTR_WANTED(&ova, va_uid);
#if 0	/* enable this when we support UUIDs as official owners */
		VATTR_WANTED(&ova, va_uuuid);
#endif
		KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
	}

	/*
	 * If flags are being changed, we need the old flags.
	 */
	if (VATTR_IS_ACTIVE(vap, va_flags)) {
		KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
		VATTR_WANTED(&ova, va_flags);
	}

	/*
	 * If the size is being set, make sure it's not a directory.
	 */
	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
		/* size is meaningless on a directory, don't permit this */
		if (vnode_isdir(vp)) {
			KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory");
			error = EISDIR;
			goto out;
		}
	}

	/*
	 * Get old data.
	 */
	KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
	if ((error = vnode_getattr(vp, &ova, ctx)) != 0) {
		KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
		goto out;
	}

	/*
	 * Size changes require write access to the file data.
	 */
	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
		/* if we can't get the size, or it's different, we need write access */
		KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
		required_action |= KAUTH_VNODE_WRITE_DATA;
	}

	/*
	 * Changing timestamps?
	 *
	 * Note that we are only called to authorize user-requested time changes;
	 * side-effect time changes are not authorized.  Authorisation is only
	 * required for existing files.
	 *
	 * Non-owners are not permitted to change the time on an existing
	 * file to anything other than the current time.
	 */
	if (VATTR_IS_ACTIVE(vap, va_create_time) ||
	    VATTR_IS_ACTIVE(vap, va_change_time) ||
	    VATTR_IS_ACTIVE(vap, va_modify_time) ||
	    VATTR_IS_ACTIVE(vap, va_access_time) ||
	    VATTR_IS_ACTIVE(vap, va_backup_time)) {
		/*
		 * The owner and root may set any timestamps they like,
		 * provided that the file is not immutable.  The owner still needs
		 * WRITE_ATTRIBUTES (implied by ownership but still deniable).
		 */
		if (has_priv_suser || vauth_node_owner(&ova, cred)) {
			KAUTH_DEBUG("ATTR - root or owner changing timestamps");
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES;
		} else {
			/* just setting the current time? (utimes(path, NULL) case) */
			if (vap->va_vaflags & VA_UTIMES_NULL) {
				KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
				required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES;
			} else {
				KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Changing file mode?
	 */
	if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
		KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);

		/*
		 * Mode changes always have the same basic auth requirements.
		 */
		if (has_priv_suser) {
			KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
		} else {
			/* need WRITE_SECURITY */
			KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
			required_action |= KAUTH_VNODE_WRITE_SECURITY;
		}

		/*
		 * Can't set the setgid bit if you're not in the group and not root.  Have to have
		 * existing group information in the case we're not setting it right now.
		 */
		if (vap->va_mode & S_ISGID) {
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
			if (!has_priv_suser) {
				/* prefer the group being set in this call, else the file's current group */
				if (VATTR_IS_ACTIVE(vap, va_gid)) {
					group = vap->va_gid;
				} else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
					group = ova.va_gid;
				} else {
					KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
					error = EINVAL;
					goto out;
				}
				/*
				 * This might be too restrictive; WRITE_SECURITY might be implied by
				 * membership in this case, rather than being an additional requirement.
				 */
				if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) {
					KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
					goto out;
				}
				if (!ismember) {
					KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
					error = EPERM;
					goto out;
				}
			}
		}

		/*
		 * Can't set the setuid bit unless you're root or the file's owner.
		 */
		if (vap->va_mode & S_ISUID) {
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */
			if (!has_priv_suser) {
				/* prefer the owner being set in this call, else the file's current owner */
				if (VATTR_IS_ACTIVE(vap, va_uid)) {
					owner = vap->va_uid;
				} else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
					owner = ova.va_uid;
				} else {
					KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
					error = EINVAL;
					goto out;
				}
				if (owner != kauth_cred_getuid(cred)) {
					/*
					 * We could allow this if WRITE_SECURITY is permitted, perhaps.
					 */
					KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
					error = EPERM;
					goto out;
				}
			}
		}
	}

	/*
	 * Validate/mask flags changes.  This checks that only the flags in
	 * the UF_SETTABLE mask are being set, and preserves the flags in
	 * the SF_SETTABLE case.
	 *
	 * Since flags changes may be made in conjunction with other changes,
	 * we will ask the auth code to ignore immutability in the case that
	 * the SF_* flags are not set and we are only manipulating the file flags.
	 *
	 */
	if (VATTR_IS_ACTIVE(vap, va_flags)) {
		/* compute changing flags bits */
		if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
			fdelta = vap->va_flags ^ ova.va_flags;
		} else {
			/* no old flags available; treat all requested flags as changing */
			fdelta = vap->va_flags;
		}

		if (fdelta != 0) {
			KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
			required_action |= KAUTH_VNODE_WRITE_SECURITY;

			/* check that changing bits are legal */
			if (has_priv_suser) {
				/*
				 * The immutability check will prevent us from clearing the SF_*
				 * flags unless the system securelevel permits it, so just check
				 * for legal flags here.
				 */
				if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) {
					error = EPERM;
					KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
					goto out;
				}
			} else {
				if (fdelta & ~UF_SETTABLE) {
					error = EPERM;
					KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
					goto out;
				}
			}
			/*
			 * If the caller has the ability to manipulate file flags,
			 * security is not reduced by ignoring them for this operation.
			 *
			 * A more complete test here would consider the 'after' states of the flags
			 * to determine whether it would permit the operation, but this becomes
			 * very complex.
			 *
			 * Ignoring immutability is conditional on securelevel; this does not bypass
			 * the SF_* flags if securelevel > 0.
			 */
			required_action |= KAUTH_VNODE_NOIMMUTABLE;
		}
	}

	/*
	 * Validate ownership information.
	 */
	chowner = 0;
	chgroup = 0;
	clear_suid = 0;
	clear_sgid = 0;

	/*
	 * uid changing
	 * Note that if the filesystem didn't give us a UID, we expect that it doesn't
	 * support them in general, and will ignore it if/when we try to set it.
	 * We might want to clear the uid out of vap completely here.
	 */
	if (VATTR_IS_ACTIVE(vap, va_uid)) {
		if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
			/* NOTE(review): "ownershipt" typo lives in this runtime debug string */
			if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
				KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party");
				error = EPERM;
				goto out;
			}
			chowner = 1;
		}
		clear_suid = 1;
	}

	/*
	 * gid changing
	 * Note that if the filesystem didn't give us a GID, we expect that it doesn't
	 * support them in general, and will ignore it if/when we try to set it.
	 * We might want to clear the gid out of vap completely here.
	 */
	if (VATTR_IS_ACTIVE(vap, va_gid)) {
		if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
			if (!has_priv_suser) {
				if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) {
					KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
					goto out;
				}
				if (!ismember) {
					KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
					    ova.va_gid, vap->va_gid);
					error = EPERM;
					goto out;
				}
			}
			chgroup = 1;
		}
		clear_sgid = 1;
	}

	/*
	 * Owner UUID being set or changed.
	 */
	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
		/* if the owner UUID is not actually changing ... */
		if (VATTR_IS_SUPPORTED(&ova, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
			goto no_uuuid_change;

		/*
		 * The owner UUID cannot be set by a non-superuser to anything other than
		 * their own.
		 */
		if (!has_priv_suser) {
			if ((error = kauth_cred_getguid(cred, &changer)) != 0) {
				KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
				/* XXX ENOENT here - no UUID - should perhaps become EPERM */
				goto out;
			}
			if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
				KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us");
				error = EPERM;
				goto out;
			}
		}
		chowner = 1;
		clear_suid = 1;
	}
no_uuuid_change:
	/*
	 * Group UUID being set or changed.
	 */
	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
		/* if the group UUID is not actually changing ... */
		if (VATTR_IS_SUPPORTED(&ova, va_guuid) && kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
			goto no_guuid_change;

		/*
		 * The group UUID cannot be set by a non-superuser to anything other than
		 * one of which they are a member.
		 */
		if (!has_priv_suser) {
			if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) {
				KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
				goto out;
			}
			if (!ismember) {
				KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
				error = EPERM;
				goto out;
			}
		}
		chgroup = 1;
	}
no_guuid_change:

	/*
	 * Compute authorisation for group/ownership changes.
	 */
	if (chowner || chgroup || clear_suid || clear_sgid) {
		if (has_priv_suser) {
			KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
			required_action |= KAUTH_VNODE_CHECKIMMUTABLE;
		} else {
			if (chowner) {
				KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
				required_action |= KAUTH_VNODE_TAKE_OWNERSHIP;
			}
			if (chgroup && !chowner) {
				KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
				required_action |= KAUTH_VNODE_WRITE_SECURITY;
			}

			/* clear set-uid and set-gid bits as required by Posix */
			if (VATTR_IS_ACTIVE(vap, va_mode)) {
				newmode = vap->va_mode;
			} else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
				newmode = ova.va_mode;
			} else {
				KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
				newmode = 0;
			}
			if (newmode & (S_ISUID | S_ISGID)) {
				VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID));
				KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode);
			}
		}
	}

	/*
	 * Authorise changes in the ACL.
6388 */ 6389 if (VATTR_IS_ACTIVE(vap, va_acl)) { 6390 6391 /* no existing ACL */ 6392 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) { 6393 6394 /* adding an ACL */ 6395 if (vap->va_acl != NULL) { 6396 required_action |= KAUTH_VNODE_WRITE_SECURITY; 6397 KAUTH_DEBUG("CHMOD - adding ACL"); 6398 } 6399 6400 /* removing an existing ACL */ 6401 } else if (vap->va_acl == NULL) { 6402 required_action |= KAUTH_VNODE_WRITE_SECURITY; 6403 KAUTH_DEBUG("CHMOD - removing ACL"); 6404 6405 /* updating an existing ACL */ 6406 } else { 6407 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) { 6408 /* entry count changed, must be different */ 6409 required_action |= KAUTH_VNODE_WRITE_SECURITY; 6410 KAUTH_DEBUG("CHMOD - adding/removing ACL entries"); 6411 } else if (vap->va_acl->acl_entrycount > 0) { 6412 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ 6413 if (!memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], 6414 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { 6415 required_action |= KAUTH_VNODE_WRITE_SECURITY; 6416 KAUTH_DEBUG("CHMOD - changing ACL entries"); 6417 } 6418 } 6419 } 6420 } 6421 6422 /* 6423 * Other attributes that require authorisation. 6424 */ 6425 if (VATTR_IS_ACTIVE(vap, va_encoding)) 6426 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; 6427 6428out: 6429 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) 6430 kauth_acl_free(ova.va_acl); 6431 if (error == 0) 6432 *actionp = required_action; 6433 return(error); 6434} 6435 6436 6437void 6438vfs_setlocklocal(mount_t mp) 6439{ 6440 vnode_t vp; 6441 6442 mount_lock(mp); 6443 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; 6444 6445 /* 6446 * We do not expect anyone to be using any vnodes at the 6447 * time this routine is called. 
 So no need for vnode locking */
	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
		vp->v_flag |= VLOCKLOCAL;
	}
	TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) {
		vp->v_flag |= VLOCKLOCAL;
	}
	TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) {
		vp->v_flag |= VLOCKLOCAL;
	}
	mount_unlock(mp);
}

/*
 * Mark a vnode as being waited on by a union-mount operation.
 * Cleared (and waiters woken) by vn_clearunionwait().
 */
void
vn_setunionwait(vnode_t vp)
{
	vnode_lock_spin(vp);
	vp->v_flag |= VISUNION;
	vnode_unlock(vp);
}


/*
 * Block until any pending union-mount operation on this vnode completes
 * (i.e. until VISUNION is cleared by vn_clearunionwait()).
 */
void
vn_checkunionwait(vnode_t vp)
{
	vnode_lock(vp);
	while ((vp->v_flag & VISUNION) == VISUNION)
		msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0);
	vnode_unlock(vp);
}

/*
 * Clear the union-wait flag and wake any threads sleeping in
 * vn_checkunionwait().  'locked' is non-zero when the caller already
 * holds the vnode lock.
 */
void
vn_clearunionwait(vnode_t vp, int locked)
{
	if (!locked)
		vnode_lock(vp);
	if((vp->v_flag & VISUNION) == VISUNION) {
		vp->v_flag &= ~VISUNION;
		wakeup((caddr_t)&vp->v_flag);
	}
	if (!locked)
		vnode_unlock(vp);
}

/*
 * XXX - get "don't trigger mounts" flag for thread; used by autofs.
 */
extern int thread_notrigger(void);

int
thread_notrigger(void)
{
	/* read the per-thread flag out of the current thread's uthread */
	struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread());
	return (uth->uu_notrigger);
}

/*
 * Removes orphaned apple double files during a rmdir
 * Works by:
 * 1. vnode_suspend().
 * 2. Call VNOP_READDIR() till the end of directory is reached.
 * 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY.
 * 4. Continue (2) and (3) till end of directory is reached.
 * 5. If all the entries in the directory were files with "._" name, delete all the files.
 * 6. vnode_resume()
 * 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
 */

errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * restart_flag)
{

#define UIO_BUFF_SIZE 2048
	uio_t auio = NULL;
	int eofflag, siz = UIO_BUFF_SIZE, nentries = 0;
	int open_flag = 0, full_erase_flag = 0;
	char uio_buf[ UIO_SIZEOF(1) ];
	char *rbuf = NULL, *cpos, *cend;
	struct nameidata nd_temp;
	struct dirent *dp;
	errno_t error;

	/* suspend the vnode so no other operation can race with the scan/delete */
	error = vnode_suspend(vp);

	/*
	 * restart_flag is set so that the calling rmdir sleeps and resets
	 */
	if (error == EBUSY)
		*restart_flag = 1;
	if (error != 0)
		goto outsc;
	/*
	 * NOTE(review): when vnode_suspend() fails we still fall through to
	 * uio_free(NULL)/vnode_resume() at 'outsc' — confirm both tolerate the
	 * never-suspended / never-allocated case.
	 */

	/*
	 * set up UIO
	 */
	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
	if (rbuf)
		auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
				&uio_buf[0], sizeof(uio_buf));
	if (!rbuf || !auio) {
		error = ENOMEM;
		goto outsc;
	}

	uio_setoffset(auio,0);

	eofflag = 0;

	if ((error = VNOP_OPEN(vp, FREAD, ctx)))
		goto outsc;
	else
		open_flag = 1;	/* remember to VNOP_CLOSE on the way out */

	/*
	 * First pass checks if all files are appleDouble files.
	 */

	do {
		siz = UIO_BUFF_SIZE;
		uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);

		if((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx)))
			goto outsc;

		/* shrink siz to the number of bytes actually filled in */
		if (uio_resid(auio) != 0)
			siz -= uio_resid(auio);

		/*
		 * Iterate through directory
		 */
		cpos = rbuf;
		cend = rbuf + siz;
		dp = (struct dirent*) cpos;

		if (cpos == cend)
			eofflag = 1;

		while ((cpos < cend)) {
			/*
			 * Check for . and .. as well as directories
			 */
			if (dp->d_ino != 0 &&
					!((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
					  (dp->d_namlen == 2 && dp->d_name[0] == '.'
					    && dp->d_name[1] == '.'))) {
				/*
				 * Check for irregular files and ._ files
				 * If there is a ._._ file abort the op
				 */
				if ( dp->d_namlen < 2 ||
						strncmp(dp->d_name,"._",2) ||
						(dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._",2))) {
					error = ENOTEMPTY;
					goto outsc;
				}
			}
			/* advance to the next (variable-length) dirent */
			cpos += dp->d_reclen;
			dp = (struct dirent*)cpos;
		}

		/*
		 * workaround for HFS/NFS setting eofflag before end of file
		 */
		if (vp->v_tag == VT_HFS && nentries > 2)
			eofflag=0;

		if (vp->v_tag == VT_NFS) {
			if (eofflag && !full_erase_flag) {
				/* NFS: rewind once and rescan to be sure we saw everything */
				full_erase_flag = 1;
				eofflag = 0;
				uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
			}
			else if (!eofflag && full_erase_flag)
				full_erase_flag = 0;
		}

	} while (!eofflag);
	/*
	 * If we've made it here all the files in the dir are AppleDouble
	 * We can delete the files even though the node is suspended
	 * because we are the owner of the file.
	 */

	/* second pass: rewind and delete every non-dot entry */
	uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
	eofflag = 0;
	full_erase_flag = 0;

	do {
		siz = UIO_BUFF_SIZE;
		uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
		uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);

		error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx);

		if (error != 0)
			goto outsc;

		if (uio_resid(auio) != 0)
			siz -= uio_resid(auio);

		/*
		 * Iterate through directory
		 */
		cpos = rbuf;
		cend = rbuf + siz;
		dp = (struct dirent*) cpos;

		if (cpos == cend)
			eofflag = 1;

		while ((cpos < cend)) {
			/*
			 * Check for . and .. as well as directories
			 */
			if (dp->d_ino != 0 &&
					!((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
					  (dp->d_namlen == 2 && dp->d_name[0] == '.'
					    && dp->d_name[1] == '.'))
					  ) {
				/* unlink the ._ file relative to this directory */
				NDINIT(&nd_temp, DELETE, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx);
				nd_temp.ni_dvp = vp;
				error = unlink1(ctx, &nd_temp, 0);
				/* a racing deletion (ENOENT) is not fatal */
				if(error && error != ENOENT)
					goto outsc;
			}
			cpos += dp->d_reclen;
			dp = (struct dirent*)cpos;
		}

		/*
		 * workaround for HFS/NFS setting eofflag before end of file
		 */
		if (vp->v_tag == VT_HFS && nentries > 2)
			eofflag=0;

		if (vp->v_tag == VT_NFS) {
			if (eofflag && !full_erase_flag) {
				full_erase_flag = 1;
				eofflag = 0;
				uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ);
			}
			else if (!eofflag && full_erase_flag)
				full_erase_flag = 0;
		}

	} while (!eofflag);


	error = 0;

outsc:
	if (open_flag)
		VNOP_CLOSE(vp, FREAD, ctx);

	uio_free(auio);
	FREE(rbuf, M_TEMP);

	vnode_resume(vp);


	return(error);

}


#ifdef JOE_DEBUG

/*
 * Debug-only accounting of vnode iocount changes made by the current
 * thread; remembers up to 32 distinct vnodes per thread.
 * NOTE(review): pre-C99 implicit-int return type — invalid in C99 and later;
 * should be declared with an explicit return type if this build target moves
 * past C89.
 */
record_vp(vnode_t vp, int count) {
	struct uthread *ut;
	int i;

	/* system vnodes are not tracked */
	if ((vp->v_flag & VSYSTEM))
		return;

	ut = get_bsdthread_info(current_thread());
	ut->uu_iocount += count;

	if (ut->uu_vpindex < 32) {
		/* record each vnode only once */
		for (i = 0; i < ut->uu_vpindex; i++) {
			if (ut->uu_vps[i] == vp)
				return;
		}
		ut->uu_vps[ut->uu_vpindex] = vp;
		ut->uu_vpindex++;
	}
}
#endif