1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ 29/* 30 * Copyright (c) 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * (c) UNIX System Laboratories, Inc. 33 * All or some portions of this file are derived from material licensed 34 * to the University of California by American Telephone and Telegraph 35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 36 * the permission of UNIX System Laboratories, Inc. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 67 */ 68/* 69 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 70 * support for mandatory and extensible security protections. 
This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 *	External virtual filesystem routines
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/mount_internal.h>
#include <sys/time.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf_internal.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/uio_internal.h>
#include <sys/uio.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/ubc_internal.h>
#include <sys/vm.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/kdebug.h>
#include <sys/kauth.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/kern_memorystatus.h>
#include <sys/lockf.h>
#include <miscfs/fifofs/fifo.h>

#include <string.h>
#include <machine/spl.h>


#include <kern/assert.h>
#include <mach/kern_return.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>

#include <miscfs/specfs/specdev.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/memory_object_control.h>

#include <kern/kalloc.h>	/* kalloc()/kfree() */
#include <kern/clock.h>		/* delay_for_interval() */
#include <libkern/OSAtomic.h>	/* OSAddAtomic() */


#ifdef JOE_DEBUG
#include <libkern/OSDebug.h>
#endif

#include <vm/vm_protos.h>	/* vnode_pager_vrele() */

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

/* Lock group/attributes for vnode locks; defined outside this file. */
extern lck_grp_t *vnode_lck_grp;
extern lck_attr_t *vnode_lck_attr;

#if CONFIG_TRIGGERS
extern lck_grp_t *trigger_vnode_lck_grp;
extern lck_attr_t *trigger_vnode_lck_attr;
#endif

/* Mutex passed to msleep() by mount_iterdrain() while it waits on mnt_iterref. */
extern lck_mtx_t * mnt_list_mtx_lock;

/*
 * 16-entry table of vnode types.
 * NOTE(review): presumably indexed by the file-type nibble of a mode
 * ((mode & S_IFMT) >> 12) -- confirm against the IFTOVT macro.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
/*
 * 9-entry table of S_IF* mode bits.
 * NOTE(review): presumably indexed by enum vtype -- confirm against the
 * VTTOIF macro.
 */
int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};


/* XXX These should be in a BSD accessible Mach header, but aren't. */
extern void             memory_object_mark_used(
	memory_object_control_t         control);

extern void             memory_object_mark_unused(
	memory_object_control_t         control,
	boolean_t                       rage);


/* XXX next prototype should be from <nfs/nfs.h> */
extern int       nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);

/* XXX next prototype should be from <libsa/stdlib.h> but conflicts with libkern */
__private_extern__ void qsort(
    void * array,
    size_t nmembers,
    size_t member_size,
    int (*)(const void *, const void *));

extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval);
__private_extern__ void vntblinit(void);
__private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1,
			unsigned int val2);
__private_extern__ int unlink1(vfs_context_t, struct nameidata *, int);

extern int system_inshutdown;

/* Forward declarations for helpers defined later in this file. */
static void vnode_list_add(vnode_t);
static void vnode_async_list_add(vnode_t);
static void vnode_list_remove(vnode_t);
static void vnode_list_remove_locked(vnode_t);

static void vnode_abort_advlocks(vnode_t);
static errno_t vnode_drain(vnode_t);
static void vgone(vnode_t, int flags);
static void vclean(vnode_t vp, int flag);
static void vnode_reclaim_internal(vnode_t, int, int, int);

static void vnode_dropiocount (vnode_t);

static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
static int  vnode_reload(vnode_t);
static int  vnode_isinuse_locked(vnode_t, int, int);

static void insmntque(vnode_t vp, mount_t mp);
static int mount_getvfscnt(void);
static int mount_fillfsids(fsid_t *, int );
210static void vnode_iterate_setup(mount_t); 211int vnode_umount_preflight(mount_t, vnode_t, int); 212static int vnode_iterate_prepare(mount_t); 213static int vnode_iterate_reloadq(mount_t); 214static void vnode_iterate_clear(mount_t); 215static mount_t vfs_getvfs_locked(fsid_t *); 216static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, 217 struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx); 218static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx); 219 220errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); 221 222#ifdef JOE_DEBUG 223static void record_vp(vnode_t vp, int count); 224#endif 225 226#if CONFIG_TRIGGERS 227static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external); 228static void vnode_resolver_detach(vnode_t); 229#endif 230 231TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ 232TAILQ_HEAD(deadlst, vnode) vnode_dead_list; /* vnode dead list */ 233TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list; 234 235 236TAILQ_HEAD(ragelst, vnode) vnode_rage_list; /* vnode rapid age list */ 237struct timeval rage_tv; 238int rage_limit = 0; 239int ragevnodes = 0; 240 241#define RAGE_LIMIT_MIN 100 242#define RAGE_TIME_LIMIT 5 243 244struct mntlist mountlist; /* mounted filesystem list */ 245static int nummounts = 0; 246 247#if DIAGNOSTIC 248#define VLISTCHECK(fun, vp, list) \ 249 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \ 250 panic("%s: %s vnode not on %slist", (fun), (list), (list)); 251#else 252#define VLISTCHECK(fun, vp, list) 253#endif /* DIAGNOSTIC */ 254 255#define VLISTNONE(vp) \ 256 do { \ 257 (vp)->v_freelist.tqe_next = (struct vnode *)0; \ 258 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \ 259 } while(0) 260 261#define VONLIST(vp) \ 262 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb) 263 264/* remove 
a vnode from free vnode list */ 265#define VREMFREE(fun, vp) \ 266 do { \ 267 VLISTCHECK((fun), (vp), "free"); \ 268 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \ 269 VLISTNONE((vp)); \ 270 freevnodes--; \ 271 } while(0) 272 273 274/* remove a vnode from dead vnode list */ 275#define VREMDEAD(fun, vp) \ 276 do { \ 277 VLISTCHECK((fun), (vp), "dead"); \ 278 TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist); \ 279 VLISTNONE((vp)); \ 280 vp->v_listflag &= ~VLIST_DEAD; \ 281 deadvnodes--; \ 282 } while(0) 283 284 285/* remove a vnode from async work vnode list */ 286#define VREMASYNC_WORK(fun, vp) \ 287 do { \ 288 VLISTCHECK((fun), (vp), "async_work"); \ 289 TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \ 290 VLISTNONE((vp)); \ 291 vp->v_listflag &= ~VLIST_ASYNC_WORK; \ 292 async_work_vnodes--; \ 293 } while(0) 294 295 296/* remove a vnode from rage vnode list */ 297#define VREMRAGE(fun, vp) \ 298 do { \ 299 if ( !(vp->v_listflag & VLIST_RAGE)) \ 300 panic("VREMRAGE: vp not on rage list"); \ 301 VLISTCHECK((fun), (vp), "rage"); \ 302 TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist); \ 303 VLISTNONE((vp)); \ 304 vp->v_listflag &= ~VLIST_RAGE; \ 305 ragevnodes--; \ 306 } while(0) 307 308 309/* 310 * vnodetarget hasn't been used in a long time, but 311 * it was exported for some reason... I'm leaving in 312 * place for now... it should be deprecated out of the 313 * exports and removed eventually. 314 */ 315u_int32_t vnodetarget; /* target for vnreclaim() */ 316#define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */ 317 318/* 319 * We need quite a few vnodes on the free list to sustain the 320 * rapid stat() the compilation process does, and still benefit from the name 321 * cache. Having too few vnodes on the free list causes serious disk 322 * thrashing as we cycle through them. 
323 */ 324#define VNODE_FREE_MIN CONFIG_VNODE_FREE_MIN /* freelist should have at least this many */ 325 326 327static void async_work_continue(void); 328 329/* 330 * Initialize the vnode management data structures. 331 */ 332__private_extern__ void 333vntblinit(void) 334{ 335 thread_t thread = THREAD_NULL; 336 337 TAILQ_INIT(&vnode_free_list); 338 TAILQ_INIT(&vnode_rage_list); 339 TAILQ_INIT(&vnode_dead_list); 340 TAILQ_INIT(&vnode_async_work_list); 341 TAILQ_INIT(&mountlist); 342 343 if (!vnodetarget) 344 vnodetarget = VNODE_FREE_TARGET; 345 346 microuptime(&rage_tv); 347 rage_limit = desiredvnodes / 100; 348 349 if (rage_limit < RAGE_LIMIT_MIN) 350 rage_limit = RAGE_LIMIT_MIN; 351 352 /* 353 * Scale the vm_object_cache to accomodate the vnodes 354 * we want to cache 355 */ 356 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN); 357 358 /* 359 * create worker threads 360 */ 361 kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread); 362 thread_deallocate(thread); 363} 364 365/* Reset the VM Object Cache with the values passed in */ 366__private_extern__ kern_return_t 367reset_vmobjectcache(unsigned int val1, unsigned int val2) 368{ 369 vm_size_t oval = val1 - VNODE_FREE_MIN; 370 vm_size_t nval; 371 372 if (val1 == val2) { 373 return KERN_SUCCESS; 374 } 375 376 if(val2 < VNODE_FREE_MIN) 377 nval = 0; 378 else 379 nval = val2 - VNODE_FREE_MIN; 380 381 return(adjust_vm_object_cache(oval, nval)); 382} 383 384 385/* the timeout is in 10 msecs */ 386int 387vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) { 388 int error = 0; 389 struct timespec ts; 390 391 KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0); 392 393 if (vp->v_numoutput > output_target) { 394 395 slpflag |= PDROP; 396 397 vnode_lock_spin(vp); 398 399 while ((vp->v_numoutput > output_target) && error == 0) { 400 if (output_target) 401 vp->v_flag |= VTHROTTLED; 402 else 403 vp->v_flag |= 
VBWAIT; 404 405 ts.tv_sec = (slptimeout/100); 406 ts.tv_nsec = (slptimeout % 1000) * 10 * NSEC_PER_USEC * 1000 ; 407 error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts); 408 409 vnode_lock_spin(vp); 410 } 411 vnode_unlock(vp); 412 } 413 KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0); 414 415 return error; 416} 417 418 419void 420vnode_startwrite(vnode_t vp) { 421 422 OSAddAtomic(1, &vp->v_numoutput); 423} 424 425 426void 427vnode_writedone(vnode_t vp) 428{ 429 if (vp) { 430 int need_wakeup = 0; 431 432 OSAddAtomic(-1, &vp->v_numoutput); 433 434 vnode_lock_spin(vp); 435 436 if (vp->v_numoutput < 0) 437 panic("vnode_writedone: numoutput < 0"); 438 439 if ((vp->v_flag & VTHROTTLED)) { 440 vp->v_flag &= ~VTHROTTLED; 441 need_wakeup = 1; 442 } 443 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) { 444 vp->v_flag &= ~VBWAIT; 445 need_wakeup = 1; 446 } 447 vnode_unlock(vp); 448 449 if (need_wakeup) 450 wakeup((caddr_t)&vp->v_numoutput); 451 } 452} 453 454 455 456int 457vnode_hasdirtyblks(vnode_t vp) 458{ 459 struct cl_writebehind *wbp; 460 461 /* 462 * Not taking the buf_mtxp as there is little 463 * point doing it. Even if the lock is taken the 464 * state can change right after that. If their 465 * needs to be a synchronization, it must be driven 466 * by the caller 467 */ 468 if (vp->v_dirtyblkhd.lh_first) 469 return (1); 470 471 if (!UBCINFOEXISTS(vp)) 472 return (0); 473 474 wbp = vp->v_ubcinfo->cl_wbehind; 475 476 if (wbp && (wbp->cl_number || wbp->cl_scmap)) 477 return (1); 478 479 return (0); 480} 481 482int 483vnode_hascleanblks(vnode_t vp) 484{ 485 /* 486 * Not taking the buf_mtxp as there is little 487 * point doing it. Even if the lock is taken the 488 * state can change right after that. 
If their 489 * needs to be a synchronization, it must be driven 490 * by the caller 491 */ 492 if (vp->v_cleanblkhd.lh_first) 493 return (1); 494 return (0); 495} 496 497void 498vnode_iterate_setup(mount_t mp) 499{ 500 while (mp->mnt_lflag & MNT_LITER) { 501 mp->mnt_lflag |= MNT_LITERWAIT; 502 msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", NULL); 503 } 504 505 mp->mnt_lflag |= MNT_LITER; 506 507} 508 509int 510vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) 511{ 512 vnode_t vp; 513 514 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 515 /* disable preflight only for udf, a hack to be removed after 4073176 is fixed */ 516 if (vp->v_tag == VT_UDF) 517 return 0; 518 if (vp->v_type == VDIR) 519 continue; 520 if (vp == skipvp) 521 continue; 522 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || 523 (vp->v_flag & VNOFLUSH))) 524 continue; 525 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) 526 continue; 527 if ((flags & WRITECLOSE) && 528 (vp->v_writecount == 0 || vp->v_type != VREG)) 529 continue; 530 /* Look for busy vnode */ 531 if (((vp->v_usecount != 0) && 532 ((vp->v_usecount - vp->v_kusecount) != 0))) 533 return(1); 534 } 535 536 return(0); 537} 538 539/* 540 * This routine prepares iteration by moving all the vnodes to worker queue 541 * called with mount lock held 542 */ 543int 544vnode_iterate_prepare(mount_t mp) 545{ 546 vnode_t vp; 547 548 if (TAILQ_EMPTY(&mp->mnt_vnodelist)) { 549 /* nothing to do */ 550 return (0); 551 } 552 553 vp = TAILQ_FIRST(&mp->mnt_vnodelist); 554 vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first); 555 mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first; 556 mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last; 557 558 TAILQ_INIT(&mp->mnt_vnodelist); 559 if (mp->mnt_newvnodes.tqh_first != NULL) 560 panic("vnode_iterate_prepare: newvnode when entering vnode"); 561 TAILQ_INIT(&mp->mnt_newvnodes); 562 563 return (1); 564} 565 566 567/* called with mount lock held */ 568int 
569vnode_iterate_reloadq(mount_t mp) 570{ 571 int moved = 0; 572 573 /* add the remaining entries in workerq to the end of mount vnode list */ 574 if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 575 struct vnode * mvp; 576 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst); 577 578 /* Joining the workerque entities to mount vnode list */ 579 if (mvp) 580 mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first; 581 else 582 mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first; 583 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last; 584 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last; 585 TAILQ_INIT(&mp->mnt_workerqueue); 586 } 587 588 /* add the newvnodes to the head of mount vnode list */ 589 if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) { 590 struct vnode * nlvp; 591 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst); 592 593 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first; 594 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first; 595 if(mp->mnt_vnodelist.tqh_first) 596 mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next; 597 else 598 mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last; 599 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first; 600 TAILQ_INIT(&mp->mnt_newvnodes); 601 moved = 1; 602 } 603 604 return(moved); 605} 606 607 608void 609vnode_iterate_clear(mount_t mp) 610{ 611 mp->mnt_lflag &= ~MNT_LITER; 612 if (mp->mnt_lflag & MNT_LITERWAIT) { 613 mp->mnt_lflag &= ~MNT_LITERWAIT; 614 wakeup(mp); 615 } 616} 617 618 619int 620vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *), 621 void *arg) 622{ 623 struct vnode *vp; 624 int vid, retval; 625 int ret = 0; 626 627 mount_lock(mp); 628 629 vnode_iterate_setup(mp); 630 631 /* it is returns 0 then there is nothing to do */ 632 retval = vnode_iterate_prepare(mp); 633 634 if (retval == 0) { 635 vnode_iterate_clear(mp); 636 mount_unlock(mp); 637 return(ret); 638 } 639 640 /* iterate over 
all the vnodes */ 641 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 642 vp = TAILQ_FIRST(&mp->mnt_workerqueue); 643 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); 644 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 645 vid = vp->v_id; 646 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) { 647 continue; 648 } 649 mount_unlock(mp); 650 651 if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) { 652 mount_lock(mp); 653 continue; 654 } 655 if (flags & VNODE_RELOAD) { 656 /* 657 * we're reloading the filesystem 658 * cast out any inactive vnodes... 659 */ 660 if (vnode_reload(vp)) { 661 /* vnode will be recycled on the refcount drop */ 662 vnode_put(vp); 663 mount_lock(mp); 664 continue; 665 } 666 } 667 668 retval = callout(vp, arg); 669 670 switch (retval) { 671 case VNODE_RETURNED: 672 case VNODE_RETURNED_DONE: 673 vnode_put(vp); 674 if (retval == VNODE_RETURNED_DONE) { 675 mount_lock(mp); 676 ret = 0; 677 goto out; 678 } 679 break; 680 681 case VNODE_CLAIMED_DONE: 682 mount_lock(mp); 683 ret = 0; 684 goto out; 685 case VNODE_CLAIMED: 686 default: 687 break; 688 } 689 mount_lock(mp); 690 } 691 692out: 693 (void)vnode_iterate_reloadq(mp); 694 vnode_iterate_clear(mp); 695 mount_unlock(mp); 696 return (ret); 697} 698 699void 700mount_lock_renames(mount_t mp) 701{ 702 lck_mtx_lock(&mp->mnt_renamelock); 703} 704 705void 706mount_unlock_renames(mount_t mp) 707{ 708 lck_mtx_unlock(&mp->mnt_renamelock); 709} 710 711void 712mount_lock(mount_t mp) 713{ 714 lck_mtx_lock(&mp->mnt_mlock); 715} 716 717void 718mount_lock_spin(mount_t mp) 719{ 720 lck_mtx_lock_spin(&mp->mnt_mlock); 721} 722 723void 724mount_unlock(mount_t mp) 725{ 726 lck_mtx_unlock(&mp->mnt_mlock); 727} 728 729 730void 731mount_ref(mount_t mp, int locked) 732{ 733 if ( !locked) 734 mount_lock_spin(mp); 735 736 mp->mnt_count++; 737 738 if ( !locked) 739 mount_unlock(mp); 740} 741 742 743void 744mount_drop(mount_t mp, int locked) 745{ 746 if ( 
!locked) 747 mount_lock_spin(mp); 748 749 mp->mnt_count--; 750 751 if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) 752 wakeup(&mp->mnt_lflag); 753 754 if ( !locked) 755 mount_unlock(mp); 756} 757 758 759int 760mount_iterref(mount_t mp, int locked) 761{ 762 int retval = 0; 763 764 if (!locked) 765 mount_list_lock(); 766 if (mp->mnt_iterref < 0) { 767 retval = 1; 768 } else { 769 mp->mnt_iterref++; 770 } 771 if (!locked) 772 mount_list_unlock(); 773 return(retval); 774} 775 776int 777mount_isdrained(mount_t mp, int locked) 778{ 779 int retval; 780 781 if (!locked) 782 mount_list_lock(); 783 if (mp->mnt_iterref < 0) 784 retval = 1; 785 else 786 retval = 0; 787 if (!locked) 788 mount_list_unlock(); 789 return(retval); 790} 791 792void 793mount_iterdrop(mount_t mp) 794{ 795 mount_list_lock(); 796 mp->mnt_iterref--; 797 wakeup(&mp->mnt_iterref); 798 mount_list_unlock(); 799} 800 801void 802mount_iterdrain(mount_t mp) 803{ 804 mount_list_lock(); 805 while (mp->mnt_iterref) 806 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL); 807 /* mount iterations drained */ 808 mp->mnt_iterref = -1; 809 mount_list_unlock(); 810} 811void 812mount_iterreset(mount_t mp) 813{ 814 mount_list_lock(); 815 if (mp->mnt_iterref == -1) 816 mp->mnt_iterref = 0; 817 mount_list_unlock(); 818} 819 820/* always called with mount lock held */ 821int 822mount_refdrain(mount_t mp) 823{ 824 if (mp->mnt_lflag & MNT_LDRAIN) 825 panic("already in drain"); 826 mp->mnt_lflag |= MNT_LDRAIN; 827 828 while (mp->mnt_count) 829 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL); 830 831 if (mp->mnt_vnodelist.tqh_first != NULL) 832 panic("mount_refdrain: dangling vnode"); 833 834 mp->mnt_lflag &= ~MNT_LDRAIN; 835 836 return(0); 837} 838 839/* Tags the mount point as not supportine extended readdir for NFS exports */ 840void 841mount_set_noreaddirext(mount_t mp) { 842 mount_lock (mp); 843 mp->mnt_kern_flag |= MNTK_DENY_READDIREXT; 844 mount_unlock 
(mp); 845} 846 847/* 848 * Mark a mount point as busy. Used to synchronize access and to delay 849 * unmounting. 850 */ 851int 852vfs_busy(mount_t mp, int flags) 853{ 854 855restart: 856 if (mp->mnt_lflag & MNT_LDEAD) 857 return(ENOENT); 858 859 if (mp->mnt_lflag & MNT_LUNMOUNT) { 860 if (flags & LK_NOWAIT) 861 return (ENOENT); 862 863 mount_lock(mp); 864 865 if (mp->mnt_lflag & MNT_LDEAD) { 866 mount_unlock(mp); 867 return(ENOENT); 868 } 869 if (mp->mnt_lflag & MNT_LUNMOUNT) { 870 mp->mnt_lflag |= MNT_LWAIT; 871 /* 872 * Since all busy locks are shared except the exclusive 873 * lock granted when unmounting, the only place that a 874 * wakeup needs to be done is at the release of the 875 * exclusive lock at the end of dounmount. 876 */ 877 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL); 878 return (ENOENT); 879 } 880 mount_unlock(mp); 881 } 882 883 lck_rw_lock_shared(&mp->mnt_rwlock); 884 885 /* 886 * until we are granted the rwlock, it's possible for the mount point to 887 * change state, so reevaluate before granting the vfs_busy 888 */ 889 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { 890 lck_rw_done(&mp->mnt_rwlock); 891 goto restart; 892 } 893 return (0); 894} 895 896/* 897 * Free a busy filesystem. 898 */ 899 900void 901vfs_unbusy(mount_t mp) 902{ 903 lck_rw_done(&mp->mnt_rwlock); 904} 905 906 907 908static void 909vfs_rootmountfailed(mount_t mp) { 910 911 mount_list_lock(); 912 mp->mnt_vtable->vfc_refcount--; 913 mount_list_unlock(); 914 915 vfs_unbusy(mp); 916 917 mount_lock_destroy(mp); 918 919#if CONFIG_MACF 920 mac_mount_label_destroy(mp); 921#endif 922 923 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); 924} 925 926/* 927 * Lookup a filesystem type, and if found allocate and initialize 928 * a mount structure for it. 929 * 930 * Devname is usually updated by mount(8) after booting. 
931 */ 932static mount_t 933vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname) 934{ 935 mount_t mp; 936 937 mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK); 938 bzero((char *)mp, sizeof(struct mount)); 939 940 /* Initialize the default IO constraints */ 941 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; 942 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; 943 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt; 944 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt; 945 mp->mnt_devblocksize = DEV_BSIZE; 946 mp->mnt_alignmentmask = PAGE_MASK; 947 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH; 948 mp->mnt_ioscale = 1; 949 mp->mnt_ioflags = 0; 950 mp->mnt_realrootvp = NULLVP; 951 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL; 952 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; 953 mp->mnt_devbsdunit = 0; 954 955 mount_lock_init(mp); 956 (void)vfs_busy(mp, LK_NOWAIT); 957 958 TAILQ_INIT(&mp->mnt_vnodelist); 959 TAILQ_INIT(&mp->mnt_workerqueue); 960 TAILQ_INIT(&mp->mnt_newvnodes); 961 962 mp->mnt_vtable = vfsp; 963 mp->mnt_op = vfsp->vfc_vfsops; 964 mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS; 965 mp->mnt_vnodecovered = NULLVP; 966 //mp->mnt_stat.f_type = vfsp->vfc_typenum; 967 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 968 969 mount_list_lock(); 970 vfsp->vfc_refcount++; 971 mount_list_unlock(); 972 973 strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); 974 mp->mnt_vfsstat.f_mntonname[0] = '/'; 975 /* XXX const poisoning layering violation */ 976 (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL); 977 978#if CONFIG_MACF 979 mac_mount_label_init(mp); 980 mac_mount_label_associate(vfs_context_kernel(), mp); 981#endif 982 return (mp); 983} 984 985errno_t 986vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp) 987{ 988 struct vfstable *vfsp; 989 990 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 991 if (!strncmp(vfsp->vfc_name, fstypename, 992 
sizeof(vfsp->vfc_name))) 993 break; 994 if (vfsp == NULL) 995 return (ENODEV); 996 997 *mpp = vfs_rootmountalloc_internal(vfsp, devname); 998 999 if (*mpp) 1000 return (0); 1001 1002 return (ENOMEM); 1003} 1004 1005 1006/* 1007 * Find an appropriate filesystem to use for the root. If a filesystem 1008 * has not been preselected, walk through the list of known filesystems 1009 * trying those that have mountroot routines, and try them until one 1010 * works or we have tried them all. 1011 */ 1012extern int (*mountroot)(void); 1013 1014int 1015vfs_mountroot(void) 1016{ 1017#if CONFIG_MACF 1018 struct vnode *vp; 1019#endif 1020 struct vfstable *vfsp; 1021 vfs_context_t ctx = vfs_context_kernel(); 1022 struct vfs_attr vfsattr; 1023 int error; 1024 mount_t mp; 1025 vnode_t bdevvp_rootvp; 1026 1027 if (mountroot != NULL) { 1028 /* 1029 * used for netboot which follows a different set of rules 1030 */ 1031 error = (*mountroot)(); 1032 return (error); 1033 } 1034 if ((error = bdevvp(rootdev, &rootvp))) { 1035 printf("vfs_mountroot: can't setup bdevvp\n"); 1036 return (error); 1037 } 1038 /* 1039 * 4951998 - code we call in vfc_mountroot may replace rootvp 1040 * so keep a local copy for some house keeping. 1041 */ 1042 bdevvp_rootvp = rootvp; 1043 1044 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 1045 if (vfsp->vfc_mountroot == NULL) 1046 continue; 1047 1048 mp = vfs_rootmountalloc_internal(vfsp, "root_device"); 1049 mp->mnt_devvp = rootvp; 1050 1051 if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) { 1052 if ( bdevvp_rootvp != rootvp ) { 1053 /* 1054 * rootvp changed... 1055 * bump the iocount and fix up mnt_devvp for the 1056 * new rootvp (it will already have a usecount taken)... 1057 * drop the iocount and the usecount on the orignal 1058 * since we are no longer going to use it... 
1059 */ 1060 vnode_getwithref(rootvp); 1061 mp->mnt_devvp = rootvp; 1062 1063 vnode_rele(bdevvp_rootvp); 1064 vnode_put(bdevvp_rootvp); 1065 } 1066 mp->mnt_devvp->v_specflags |= SI_MOUNTEDON; 1067 1068 vfs_unbusy(mp); 1069 1070 mount_list_add(mp); 1071 1072 /* 1073 * cache the IO attributes for the underlying physical media... 1074 * an error return indicates the underlying driver doesn't 1075 * support all the queries necessary... however, reasonable 1076 * defaults will have been set, so no reason to bail or care 1077 */ 1078 vfs_init_io_attributes(rootvp, mp); 1079 1080 /* 1081 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS. 1082 */ 1083 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) { 1084 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; 1085 } 1086 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) { 1087 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT; 1088 } 1089 1090 /* 1091 * Probe root file system for additional features. 1092 */ 1093 (void)VFS_START(mp, 0, ctx); 1094 1095 VFSATTR_INIT(&vfsattr); 1096 VFSATTR_WANTED(&vfsattr, f_capabilities); 1097 if (vfs_getattr(mp, &vfsattr, ctx) == 0 && 1098 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { 1099 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && 1100 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { 1101 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; 1102 } 1103#if NAMEDSTREAMS 1104 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) && 1105 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) { 1106 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS; 1107 } 1108#endif 1109 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) && 1110 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) { 1111 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID; 1112 } 1113 } 1114 1115 /* 1116 * 
get rid of iocount reference returned 1117 * by bdevvp (or picked up by us on the substitued 1118 * rootvp)... it (or we) will have also taken 1119 * a usecount reference which we want to keep 1120 */ 1121 vnode_put(rootvp); 1122 1123#if CONFIG_MACF 1124 if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) 1125 return (0); 1126 1127 error = VFS_ROOT(mp, &vp, ctx); 1128 if (error) { 1129 printf("%s() VFS_ROOT() returned %d\n", 1130 __func__, error); 1131 dounmount(mp, MNT_FORCE, 0, ctx); 1132 goto fail; 1133 } 1134 error = vnode_label(mp, NULL, vp, NULL, 0, ctx); 1135 /* 1136 * get rid of reference provided by VFS_ROOT 1137 */ 1138 vnode_put(vp); 1139 1140 if (error) { 1141 printf("%s() vnode_label() returned %d\n", 1142 __func__, error); 1143 dounmount(mp, MNT_FORCE, 0, ctx); 1144 goto fail; 1145 } 1146#endif 1147 return (0); 1148 } 1149#if CONFIG_MACF 1150fail: 1151#endif 1152 vfs_rootmountfailed(mp); 1153 1154 if (error != EINVAL) 1155 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); 1156 } 1157 return (ENODEV); 1158} 1159 1160/* 1161 * Lookup a mount point by filesystem identifier. 
1162 */ 1163 1164struct mount * 1165vfs_getvfs(fsid_t *fsid) 1166{ 1167 return (mount_list_lookupby_fsid(fsid, 0, 0)); 1168} 1169 1170static struct mount * 1171vfs_getvfs_locked(fsid_t *fsid) 1172{ 1173 return(mount_list_lookupby_fsid(fsid, 1, 0)); 1174} 1175 1176struct mount * 1177vfs_getvfs_by_mntonname(char *path) 1178{ 1179 mount_t retmp = (mount_t)0; 1180 mount_t mp; 1181 1182 mount_list_lock(); 1183 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 1184 if (!strncmp(mp->mnt_vfsstat.f_mntonname, path, 1185 sizeof(mp->mnt_vfsstat.f_mntonname))) { 1186 retmp = mp; 1187 if (mount_iterref(retmp, 1)) 1188 retmp = NULL; 1189 goto out; 1190 } 1191 } 1192out: 1193 mount_list_unlock(); 1194 return (retmp); 1195} 1196 1197/* generation number for creation of new fsids */ 1198u_short mntid_gen = 0; 1199/* 1200 * Get a new unique fsid 1201 */ 1202void 1203vfs_getnewfsid(struct mount *mp) 1204{ 1205 1206 fsid_t tfsid; 1207 int mtype; 1208 mount_t nmp; 1209 1210 mount_list_lock(); 1211 1212 /* generate a new fsid */ 1213 mtype = mp->mnt_vtable->vfc_typenum; 1214 if (++mntid_gen == 0) 1215 mntid_gen++; 1216 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); 1217 tfsid.val[1] = mtype; 1218 1219 TAILQ_FOREACH(nmp, &mountlist, mnt_list) { 1220 while (vfs_getvfs_locked(&tfsid)) { 1221 if (++mntid_gen == 0) 1222 mntid_gen++; 1223 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); 1224 } 1225 } 1226 mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0]; 1227 mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1]; 1228 mount_list_unlock(); 1229} 1230 1231/* 1232 * Routines having to do with the management of the vnode table. 1233 */ 1234extern int (**dead_vnodeop_p)(void *); 1235long numvnodes, freevnodes, deadvnodes, async_work_vnodes; 1236 1237 1238int async_work_timed_out = 0; 1239int async_work_handled = 0; 1240int dead_vnode_wanted = 0; 1241int dead_vnode_waited = 0; 1242 1243/* 1244 * Move a vnode from one mount queue to another. 
1245 */ 1246static void 1247insmntque(vnode_t vp, mount_t mp) 1248{ 1249 mount_t lmp; 1250 /* 1251 * Delete from old mount point vnode list, if on one. 1252 */ 1253 if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) { 1254 if ((vp->v_lflag & VNAMED_MOUNT) == 0) 1255 panic("insmntque: vp not in mount vnode list"); 1256 vp->v_lflag &= ~VNAMED_MOUNT; 1257 1258 mount_lock_spin(lmp); 1259 1260 mount_drop(lmp, 1); 1261 1262 if (vp->v_mntvnodes.tqe_next == NULL) { 1263 if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) 1264 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes); 1265 else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) 1266 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes); 1267 else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) 1268 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes); 1269 } else { 1270 vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev; 1271 *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next; 1272 } 1273 vp->v_mntvnodes.tqe_next = NULL; 1274 vp->v_mntvnodes.tqe_prev = NULL; 1275 mount_unlock(lmp); 1276 return; 1277 } 1278 1279 /* 1280 * Insert into list of vnodes for the new mount point, if available. 1281 */ 1282 if ((vp->v_mount = mp) != NULL) { 1283 mount_lock_spin(mp); 1284 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) 1285 panic("vp already in mount list"); 1286 if (mp->mnt_lflag & MNT_LITER) 1287 TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes); 1288 else 1289 TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); 1290 if (vp->v_lflag & VNAMED_MOUNT) 1291 panic("insmntque: vp already in mount vnode list"); 1292 vp->v_lflag |= VNAMED_MOUNT; 1293 mount_ref(mp, 1); 1294 mount_unlock(mp); 1295 } 1296} 1297 1298 1299/* 1300 * Create a vnode for a block device. 1301 * Used for root filesystem, argdev, and swap areas. 1302 * Also used for memory file system special devices. 
1303 */ 1304int 1305bdevvp(dev_t dev, vnode_t *vpp) 1306{ 1307 vnode_t nvp; 1308 int error; 1309 struct vnode_fsparam vfsp; 1310 struct vfs_context context; 1311 1312 if (dev == NODEV) { 1313 *vpp = NULLVP; 1314 return (ENODEV); 1315 } 1316 1317 context.vc_thread = current_thread(); 1318 context.vc_ucred = FSCRED; 1319 1320 vfsp.vnfs_mp = (struct mount *)0; 1321 vfsp.vnfs_vtype = VBLK; 1322 vfsp.vnfs_str = "bdevvp"; 1323 vfsp.vnfs_dvp = NULL; 1324 vfsp.vnfs_fsnode = NULL; 1325 vfsp.vnfs_cnp = NULL; 1326 vfsp.vnfs_vops = spec_vnodeop_p; 1327 vfsp.vnfs_rdev = dev; 1328 vfsp.vnfs_filesize = 0; 1329 1330 vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; 1331 1332 vfsp.vnfs_marksystem = 0; 1333 vfsp.vnfs_markroot = 0; 1334 1335 if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) { 1336 *vpp = NULLVP; 1337 return (error); 1338 } 1339 vnode_lock_spin(nvp); 1340 nvp->v_flag |= VBDEVVP; 1341 nvp->v_tag = VT_NON; /* set this to VT_NON so during aliasing it can be replaced */ 1342 vnode_unlock(nvp); 1343 if ( (error = vnode_ref(nvp)) ) { 1344 panic("bdevvp failed: vnode_ref"); 1345 return (error); 1346 } 1347 if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) { 1348 panic("bdevvp failed: fsync"); 1349 return (error); 1350 } 1351 if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) { 1352 panic("bdevvp failed: invalidateblks"); 1353 return (error); 1354 } 1355 1356#if CONFIG_MACF 1357 /* 1358 * XXXMAC: We can't put a MAC check here, the system will 1359 * panic without this vnode. 1360 */ 1361#endif /* MAC */ 1362 1363 if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) { 1364 panic("bdevvp failed: open"); 1365 return (error); 1366 } 1367 *vpp = nvp; 1368 1369 return (0); 1370} 1371 1372/* 1373 * Check to see if the new vnode represents a special device 1374 * for which we already have a vnode (either because of 1375 * bdevvp() or because of a different vnode representing 1376 * the same block device). 
If such an alias exists, deallocate 1377 * the existing contents and return the aliased vnode. The 1378 * caller is responsible for filling it with its new contents. 1379 */ 1380static vnode_t 1381checkalias(struct vnode *nvp, dev_t nvp_rdev) 1382{ 1383 struct vnode *vp; 1384 struct vnode **vpp; 1385 struct specinfo *sin = NULL; 1386 int vid = 0; 1387 1388 vpp = &speclisth[SPECHASH(nvp_rdev)]; 1389loop: 1390 SPECHASH_LOCK(); 1391 1392 for (vp = *vpp; vp; vp = vp->v_specnext) { 1393 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { 1394 vid = vp->v_id; 1395 break; 1396 } 1397 } 1398 SPECHASH_UNLOCK(); 1399 1400 if (vp) { 1401found_alias: 1402 if (vnode_getwithvid(vp,vid)) { 1403 goto loop; 1404 } 1405 /* 1406 * Termination state is checked in vnode_getwithvid 1407 */ 1408 vnode_lock(vp); 1409 1410 /* 1411 * Alias, but not in use, so flush it out. 1412 */ 1413 if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) { 1414 vnode_reclaim_internal(vp, 1, 1, 0); 1415 vnode_put_locked(vp); 1416 vnode_unlock(vp); 1417 goto loop; 1418 } 1419 1420 } 1421 if (vp == NULL || vp->v_tag != VT_NON) { 1422 if (sin == NULL) { 1423 MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo), 1424 M_SPECINFO, M_WAITOK); 1425 } 1426 1427 nvp->v_specinfo = sin; 1428 bzero(nvp->v_specinfo, sizeof(struct specinfo)); 1429 nvp->v_rdev = nvp_rdev; 1430 nvp->v_specflags = 0; 1431 nvp->v_speclastr = -1; 1432 nvp->v_specinfo->si_opencount = 0; 1433 nvp->v_specinfo->si_initted = 0; 1434 nvp->v_specinfo->si_throttleable = 0; 1435 1436 SPECHASH_LOCK(); 1437 1438 /* We dropped the lock, someone could have added */ 1439 if (vp == NULLVP) { 1440 for (vp = *vpp; vp; vp = vp->v_specnext) { 1441 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { 1442 vid = vp->v_id; 1443 SPECHASH_UNLOCK(); 1444 goto found_alias; 1445 } 1446 } 1447 } 1448 1449 nvp->v_hashchain = vpp; 1450 nvp->v_specnext = *vpp; 1451 *vpp = nvp; 1452 1453 if (vp != NULLVP) { 1454 nvp->v_specflags |= SI_ALIASED; 1455 
vp->v_specflags |= SI_ALIASED; 1456 SPECHASH_UNLOCK(); 1457 vnode_put_locked(vp); 1458 vnode_unlock(vp); 1459 } else { 1460 SPECHASH_UNLOCK(); 1461 } 1462 1463 return (NULLVP); 1464 } 1465 1466 if (sin) { 1467 FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO); 1468 } 1469 1470 if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) 1471 return(vp); 1472 1473 panic("checkalias with VT_NON vp that shouldn't: %p", vp); 1474 1475 return (vp); 1476} 1477 1478 1479/* 1480 * Get a reference on a particular vnode and lock it if requested. 1481 * If the vnode was on the inactive list, remove it from the list. 1482 * If the vnode was on the free list, remove it from the list and 1483 * move it to inactive list as needed. 1484 * The vnode lock bit is set if the vnode is being eliminated in 1485 * vgone. The process is awakened when the transition is completed, 1486 * and an error returned to indicate that the vnode is no longer 1487 * usable (possibly having been changed to a new file system type). 1488 */ 1489int 1490vget_internal(vnode_t vp, int vid, int vflags) 1491{ 1492 int error = 0; 1493 1494 vnode_lock_spin(vp); 1495 1496 if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) 1497 /* 1498 * vnode to be returned only if it has writers opened 1499 */ 1500 error = EINVAL; 1501 else 1502 error = vnode_getiocount(vp, vid, vflags); 1503 1504 vnode_unlock(vp); 1505 1506 return (error); 1507} 1508 1509/* 1510 * Returns: 0 Success 1511 * ENOENT No such file or directory [terminating] 1512 */ 1513int 1514vnode_ref(vnode_t vp) 1515{ 1516 1517 return (vnode_ref_ext(vp, 0, 0)); 1518} 1519 1520/* 1521 * Returns: 0 Success 1522 * ENOENT No such file or directory [terminating] 1523 */ 1524int 1525vnode_ref_ext(vnode_t vp, int fmode, int flags) 1526{ 1527 int error = 0; 1528 1529 vnode_lock_spin(vp); 1530 1531 /* 1532 * once all the current call sites have been fixed to insure they have 1533 * taken an iocount, we can toughen this assert up and insist that the 1534 * iocount is 
non-zero... a non-zero usecount doesn't insure correctness 1535 */ 1536 if (vp->v_iocount <= 0 && vp->v_usecount <= 0) 1537 panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount); 1538 1539 /* 1540 * if you are the owner of drain/termination, can acquire usecount 1541 */ 1542 if ((flags & VNODE_REF_FORCE) == 0) { 1543 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { 1544 if (vp->v_owner != current_thread()) { 1545 error = ENOENT; 1546 goto out; 1547 } 1548 } 1549 } 1550 vp->v_usecount++; 1551 1552 if (fmode & FWRITE) { 1553 if (++vp->v_writecount <= 0) 1554 panic("vnode_ref_ext: v_writecount"); 1555 } 1556 if (fmode & O_EVTONLY) { 1557 if (++vp->v_kusecount <= 0) 1558 panic("vnode_ref_ext: v_kusecount"); 1559 } 1560 if (vp->v_flag & VRAGE) { 1561 struct uthread *ut; 1562 1563 ut = get_bsdthread_info(current_thread()); 1564 1565 if ( !(current_proc()->p_lflag & P_LRAGE_VNODES) && 1566 !(ut->uu_flag & UT_RAGE_VNODES)) { 1567 /* 1568 * a 'normal' process accessed this vnode 1569 * so make sure its no longer marked 1570 * for rapid aging... also, make sure 1571 * it gets removed from the rage list... 
1572 * when v_usecount drops back to 0, it 1573 * will be put back on the real free list 1574 */ 1575 vp->v_flag &= ~VRAGE; 1576 vp->v_references = 0; 1577 vnode_list_remove(vp); 1578 } 1579 } 1580 if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { 1581 1582 if (vp->v_ubcinfo) { 1583 vnode_lock_convert(vp); 1584 memory_object_mark_used(vp->v_ubcinfo->ui_control); 1585 } 1586 } 1587out: 1588 vnode_unlock(vp); 1589 1590 return (error); 1591} 1592 1593 1594static boolean_t 1595vnode_on_reliable_media(vnode_t vp) 1596{ 1597 if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL) ) 1598 return (TRUE); 1599 return (FALSE); 1600} 1601 1602static void 1603vnode_async_list_add(vnode_t vp) 1604{ 1605 vnode_list_lock(); 1606 1607 if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE|VL_DEAD))) 1608 panic("vnode_async_list_add: %p is in wrong state", vp); 1609 1610 TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist); 1611 vp->v_listflag |= VLIST_ASYNC_WORK; 1612 1613 async_work_vnodes++; 1614 1615 vnode_list_unlock(); 1616 1617 wakeup(&vnode_async_work_list); 1618 1619} 1620 1621 1622/* 1623 * put the vnode on appropriate free list. 
1624 * called with vnode LOCKED 1625 */ 1626static void 1627vnode_list_add(vnode_t vp) 1628{ 1629 boolean_t need_dead_wakeup = FALSE; 1630 1631#if DIAGNOSTIC 1632 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 1633#endif 1634 /* 1635 * if it is already on a list or non zero references return 1636 */ 1637 if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) 1638 return; 1639 1640 vnode_list_lock(); 1641 1642 if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) { 1643 /* 1644 * add the new guy to the appropriate end of the RAGE list 1645 */ 1646 if ((vp->v_flag & VAGE)) 1647 TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist); 1648 else 1649 TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist); 1650 1651 vp->v_listflag |= VLIST_RAGE; 1652 ragevnodes++; 1653 1654 /* 1655 * reset the timestamp for the last inserted vp on the RAGE 1656 * queue to let new_vnode know that its not ok to start stealing 1657 * from this list... as long as we're actively adding to this list 1658 * we'll push out the vnodes we want to donate to the real free list 1659 * once we stop pushing, we'll let some time elapse before we start 1660 * stealing them in the new_vnode routine 1661 */ 1662 microuptime(&rage_tv); 1663 } else { 1664 /* 1665 * if VL_DEAD, insert it at head of the dead list 1666 * else insert at tail of LRU list or at head if VAGE is set 1667 */ 1668 if ( (vp->v_lflag & VL_DEAD)) { 1669 TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist); 1670 vp->v_listflag |= VLIST_DEAD; 1671 deadvnodes++; 1672 1673 if (dead_vnode_wanted) { 1674 dead_vnode_wanted--; 1675 need_dead_wakeup = TRUE; 1676 } 1677 1678 } else if ( (vp->v_flag & VAGE) ) { 1679 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 1680 vp->v_flag &= ~VAGE; 1681 freevnodes++; 1682 } else { 1683 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1684 freevnodes++; 1685 } 1686 } 1687 vnode_list_unlock(); 1688 1689 if (need_dead_wakeup == TRUE) 1690 
wakeup_one((caddr_t)&dead_vnode_wanted); 1691} 1692 1693 1694/* 1695 * remove the vnode from appropriate free list. 1696 * called with vnode LOCKED and 1697 * the list lock held 1698 */ 1699static void 1700vnode_list_remove_locked(vnode_t vp) 1701{ 1702 if (VONLIST(vp)) { 1703 /* 1704 * the v_listflag field is 1705 * protected by the vnode_list_lock 1706 */ 1707 if (vp->v_listflag & VLIST_RAGE) 1708 VREMRAGE("vnode_list_remove", vp); 1709 else if (vp->v_listflag & VLIST_DEAD) 1710 VREMDEAD("vnode_list_remove", vp); 1711 else if (vp->v_listflag & VLIST_ASYNC_WORK) 1712 VREMASYNC_WORK("vnode_list_remove", vp); 1713 else 1714 VREMFREE("vnode_list_remove", vp); 1715 } 1716} 1717 1718 1719/* 1720 * remove the vnode from appropriate free list. 1721 * called with vnode LOCKED 1722 */ 1723static void 1724vnode_list_remove(vnode_t vp) 1725{ 1726#if DIAGNOSTIC 1727 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 1728#endif 1729 /* 1730 * we want to avoid taking the list lock 1731 * in the case where we're not on the free 1732 * list... this will be true for most 1733 * directories and any currently in use files 1734 * 1735 * we're guaranteed that we can't go from 1736 * the not-on-list state to the on-list 1737 * state since we hold the vnode lock... 1738 * all calls to vnode_list_add are done 1739 * under the vnode lock... so we can 1740 * check for that condition (the prevelant one) 1741 * without taking the list lock 1742 */ 1743 if (VONLIST(vp)) { 1744 vnode_list_lock(); 1745 /* 1746 * however, we're not guaranteed that 1747 * we won't go from the on-list state 1748 * to the not-on-list state until we 1749 * hold the vnode_list_lock... this 1750 * is due to "new_vnode" removing vnodes 1751 * from the free list uder the list_lock 1752 * w/o the vnode lock... 
so we need to 1753 * check again whether we're currently 1754 * on the free list 1755 */ 1756 vnode_list_remove_locked(vp); 1757 1758 vnode_list_unlock(); 1759 } 1760} 1761 1762 1763void 1764vnode_rele(vnode_t vp) 1765{ 1766 vnode_rele_internal(vp, 0, 0, 0); 1767} 1768 1769 1770void 1771vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) 1772{ 1773 vnode_rele_internal(vp, fmode, dont_reenter, 0); 1774} 1775 1776 1777void 1778vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) 1779{ 1780 1781 if ( !locked) 1782 vnode_lock_spin(vp); 1783#if DIAGNOSTIC 1784 else 1785 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 1786#endif 1787 if (--vp->v_usecount < 0) 1788 panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag); 1789 1790 if (fmode & FWRITE) { 1791 if (--vp->v_writecount < 0) 1792 panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag); 1793 } 1794 if (fmode & O_EVTONLY) { 1795 if (--vp->v_kusecount < 0) 1796 panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag); 1797 } 1798 if (vp->v_kusecount > vp->v_usecount) 1799 panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x.",vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag); 1800 1801 if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) { 1802 /* 1803 * vnode is still busy... 
if we're the last 1804 * usecount, mark for a future call to VNOP_INACTIVE 1805 * when the iocount finally drops to 0 1806 */ 1807 if (vp->v_usecount == 0) { 1808 vp->v_lflag |= VL_NEEDINACTIVE; 1809 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); 1810 } 1811 goto done; 1812 } 1813 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); 1814 1815 if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) { 1816 /* 1817 * vnode is being cleaned, or 1818 * we've requested that we don't reenter 1819 * the filesystem on this release... in 1820 * this case, we'll mark the vnode aged 1821 * if it's been marked for termination 1822 */ 1823 if (dont_reenter) { 1824 if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) ) { 1825 vp->v_lflag |= VL_NEEDINACTIVE; 1826 1827 if (vnode_on_reliable_media(vp) == FALSE) { 1828 vnode_async_list_add(vp); 1829 goto done; 1830 } 1831 } 1832 vp->v_flag |= VAGE; 1833 } 1834 vnode_list_add(vp); 1835 1836 goto done; 1837 } 1838 /* 1839 * at this point both the iocount and usecount 1840 * are zero 1841 * pick up an iocount so that we can call 1842 * VNOP_INACTIVE with the vnode lock unheld 1843 */ 1844 vp->v_iocount++; 1845#ifdef JOE_DEBUG 1846 record_vp(vp, 1); 1847#endif 1848 vp->v_lflag &= ~VL_NEEDINACTIVE; 1849 vnode_unlock(vp); 1850 1851 VNOP_INACTIVE(vp, vfs_context_current()); 1852 1853 vnode_lock_spin(vp); 1854 /* 1855 * because we dropped the vnode lock to call VNOP_INACTIVE 1856 * the state of the vnode may have changed... we may have 1857 * picked up an iocount, usecount or the MARKTERM may have 1858 * been set... we need to reevaluate the reference counts 1859 * to determine if we can call vnode_reclaim_internal at 1860 * this point... 
if the reference counts are up, we'll pick 1861 * up the MARKTERM state when they get subsequently dropped 1862 */ 1863 if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) && 1864 ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) { 1865 struct uthread *ut; 1866 1867 ut = get_bsdthread_info(current_thread()); 1868 1869 if (ut->uu_defer_reclaims) { 1870 vp->v_defer_reclaimlist = ut->uu_vreclaims; 1871 ut->uu_vreclaims = vp; 1872 goto done; 1873 } 1874 vnode_lock_convert(vp); 1875 vnode_reclaim_internal(vp, 1, 1, 0); 1876 } 1877 vnode_dropiocount(vp); 1878 vnode_list_add(vp); 1879done: 1880 if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { 1881 1882 if (vp->v_ubcinfo) { 1883 vnode_lock_convert(vp); 1884 memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE); 1885 } 1886 } 1887 if ( !locked) 1888 vnode_unlock(vp); 1889 return; 1890} 1891 1892/* 1893 * Remove any vnodes in the vnode table belonging to mount point mp. 1894 * 1895 * If MNT_NOFORCE is specified, there should not be any active ones, 1896 * return error if any are found (nb: this is a user error, not a 1897 * system error). If MNT_FORCE is specified, detach any active vnodes 1898 * that are found. 1899 */ 1900#if DIAGNOSTIC 1901int busyprt = 0; /* print out busy vnodes */ 1902#if 0 1903struct ctldebug debug1 = { "busyprt", &busyprt }; 1904#endif /* 0 */ 1905#endif 1906 1907int 1908vflush(struct mount *mp, struct vnode *skipvp, int flags) 1909{ 1910 struct vnode *vp; 1911 int busy = 0; 1912 int reclaimed = 0; 1913 int retval; 1914 unsigned int vid; 1915 1916 mount_lock(mp); 1917 vnode_iterate_setup(mp); 1918 /* 1919 * On regular unmounts(not forced) do a 1920 * quick check for vnodes to be in use. This 1921 * preserves the caching of vnodes. automounter 1922 * tries unmounting every so often to see whether 1923 * it is still busy or not. 
1924 */ 1925 if (((flags & FORCECLOSE)==0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) { 1926 if (vnode_umount_preflight(mp, skipvp, flags)) { 1927 vnode_iterate_clear(mp); 1928 mount_unlock(mp); 1929 return(EBUSY); 1930 } 1931 } 1932loop: 1933 /* it is returns 0 then there is nothing to do */ 1934 retval = vnode_iterate_prepare(mp); 1935 1936 if (retval == 0) { 1937 vnode_iterate_clear(mp); 1938 mount_unlock(mp); 1939 return(retval); 1940 } 1941 1942 /* iterate over all the vnodes */ 1943 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 1944 1945 vp = TAILQ_FIRST(&mp->mnt_workerqueue); 1946 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); 1947 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 1948 1949 if ( (vp->v_mount != mp) || (vp == skipvp)) { 1950 continue; 1951 } 1952 vid = vp->v_id; 1953 mount_unlock(mp); 1954 1955 vnode_lock_spin(vp); 1956 1957 if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) { 1958 vnode_unlock(vp); 1959 mount_lock(mp); 1960 continue; 1961 } 1962 1963 /* 1964 * If requested, skip over vnodes marked VSYSTEM. 1965 * Skip over all vnodes marked VNOFLUSH. 1966 */ 1967 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || 1968 (vp->v_flag & VNOFLUSH))) { 1969 vnode_unlock(vp); 1970 mount_lock(mp); 1971 continue; 1972 } 1973 /* 1974 * If requested, skip over vnodes marked VSWAP. 1975 */ 1976 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { 1977 vnode_unlock(vp); 1978 mount_lock(mp); 1979 continue; 1980 } 1981 /* 1982 * If requested, skip over vnodes marked VROOT. 1983 */ 1984 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) { 1985 vnode_unlock(vp); 1986 mount_lock(mp); 1987 continue; 1988 } 1989 /* 1990 * If WRITECLOSE is set, only flush out regular file 1991 * vnodes open for writing. 
1992 */ 1993 if ((flags & WRITECLOSE) && 1994 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1995 vnode_unlock(vp); 1996 mount_lock(mp); 1997 continue; 1998 } 1999 /* 2000 * If the real usecount is 0, all we need to do is clear 2001 * out the vnode data structures and we are done. 2002 */ 2003 if (((vp->v_usecount == 0) || 2004 ((vp->v_usecount - vp->v_kusecount) == 0))) { 2005 2006 vnode_lock_convert(vp); 2007 vp->v_iocount++; /* so that drain waits for * other iocounts */ 2008#ifdef JOE_DEBUG 2009 record_vp(vp, 1); 2010#endif 2011 vnode_reclaim_internal(vp, 1, 1, 0); 2012 vnode_dropiocount(vp); 2013 vnode_list_add(vp); 2014 vnode_unlock(vp); 2015 2016 reclaimed++; 2017 mount_lock(mp); 2018 continue; 2019 } 2020 /* 2021 * If FORCECLOSE is set, forcibly close the vnode. 2022 * For block or character devices, revert to an 2023 * anonymous device. For all other files, just kill them. 2024 */ 2025 if (flags & FORCECLOSE) { 2026 vnode_lock_convert(vp); 2027 2028 if (vp->v_type != VBLK && vp->v_type != VCHR) { 2029 vp->v_iocount++; /* so that drain waits * for other iocounts */ 2030#ifdef JOE_DEBUG 2031 record_vp(vp, 1); 2032#endif 2033 vnode_abort_advlocks(vp); 2034 vnode_reclaim_internal(vp, 1, 1, 0); 2035 vnode_dropiocount(vp); 2036 vnode_list_add(vp); 2037 vnode_unlock(vp); 2038 } else { 2039 vclean(vp, 0); 2040 vp->v_lflag &= ~VL_DEAD; 2041 vp->v_op = spec_vnodeop_p; 2042 vp->v_flag |= VDEVFLUSH; 2043 vnode_unlock(vp); 2044 } 2045 mount_lock(mp); 2046 continue; 2047 } 2048#if DIAGNOSTIC 2049 if (busyprt) 2050 vprint("vflush: busy vnode", vp); 2051#endif 2052 vnode_unlock(vp); 2053 mount_lock(mp); 2054 busy++; 2055 } 2056 2057 /* At this point the worker queue is completed */ 2058 if (busy && ((flags & FORCECLOSE)==0) && reclaimed) { 2059 busy = 0; 2060 reclaimed = 0; 2061 (void)vnode_iterate_reloadq(mp); 2062 /* returned with mount lock held */ 2063 goto loop; 2064 } 2065 2066 /* if new vnodes were created in between retry the reclaim */ 2067 if ( 
vnode_iterate_reloadq(mp) != 0) { 2068 if (!(busy && ((flags & FORCECLOSE)==0))) 2069 goto loop; 2070 } 2071 vnode_iterate_clear(mp); 2072 mount_unlock(mp); 2073 2074 if (busy && ((flags & FORCECLOSE)==0)) 2075 return (EBUSY); 2076 return (0); 2077} 2078 2079long num_recycledvnodes = 0; 2080/* 2081 * Disassociate the underlying file system from a vnode. 2082 * The vnode lock is held on entry. 2083 */ 2084static void 2085vclean(vnode_t vp, int flags) 2086{ 2087 vfs_context_t ctx = vfs_context_current(); 2088 int active; 2089 int need_inactive; 2090 int already_terminating; 2091 int clflags = 0; 2092#if NAMEDSTREAMS 2093 int is_namedstream; 2094#endif 2095 2096 /* 2097 * Check to see if the vnode is in use. 2098 * If so we have to reference it before we clean it out 2099 * so that its count cannot fall to zero and generate a 2100 * race against ourselves to recycle it. 2101 */ 2102 active = vp->v_usecount; 2103 2104 /* 2105 * just in case we missed sending a needed 2106 * VNOP_INACTIVE, we'll do it now 2107 */ 2108 need_inactive = (vp->v_lflag & VL_NEEDINACTIVE); 2109 2110 vp->v_lflag &= ~VL_NEEDINACTIVE; 2111 2112 /* 2113 * Prevent the vnode from being recycled or 2114 * brought into use while we clean it out. 2115 */ 2116 already_terminating = (vp->v_lflag & VL_TERMINATE); 2117 2118 vp->v_lflag |= VL_TERMINATE; 2119 2120 /* 2121 * remove the vnode from any mount list 2122 * it might be on... 2123 */ 2124 insmntque(vp, (struct mount *)0); 2125 2126#if NAMEDSTREAMS 2127 is_namedstream = vnode_isnamedstream(vp); 2128#endif 2129 2130 vnode_unlock(vp); 2131 2132 OSAddAtomicLong(1, &num_recycledvnodes); 2133 2134 if (flags & DOCLOSE) 2135 clflags |= IO_NDELAY; 2136 if (flags & REVOKEALL) 2137 clflags |= IO_REVOKE; 2138 2139 if (active && (flags & DOCLOSE)) 2140 VNOP_CLOSE(vp, clflags, ctx); 2141 2142 /* 2143 * Clean out any buffers associated with the vnode. 
2144 */ 2145 if (flags & DOCLOSE) { 2146#if NFSCLIENT 2147 if (vp->v_tag == VT_NFS) 2148 nfs_vinvalbuf(vp, V_SAVE, ctx, 0); 2149 else 2150#endif 2151 { 2152 VNOP_FSYNC(vp, MNT_WAIT, ctx); 2153 buf_invalidateblks(vp, BUF_WRITE_DATA | BUF_INVALIDATE_LOCKED, 0, 0); 2154 } 2155 if (UBCINFOEXISTS(vp)) 2156 /* 2157 * Clean the pages in VM. 2158 */ 2159 (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); 2160 } 2161 if (active || need_inactive) 2162 VNOP_INACTIVE(vp, ctx); 2163 2164#if NAMEDSTREAMS 2165 if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) { 2166 vnode_t pvp = vp->v_parent; 2167 2168 /* Delete the shadow stream file before we reclaim its vnode */ 2169 if (vnode_isshadow(vp)) { 2170 vnode_relenamedstream(pvp, vp); 2171 } 2172 2173 /* 2174 * No more streams associated with the parent. We 2175 * have a ref on it, so its identity is stable. 2176 * If the parent is on an opaque volume, then we need to know 2177 * whether it has associated named streams. 2178 */ 2179 if (vfs_authopaque(pvp->v_mount)) { 2180 vnode_lock_spin(pvp); 2181 pvp->v_lflag &= ~VL_HASSTREAMS; 2182 vnode_unlock(pvp); 2183 } 2184 } 2185#endif 2186 2187 /* 2188 * Destroy ubc named reference 2189 * cluster_release is done on this path 2190 * along with dropping the reference on the ucred 2191 */ 2192 ubc_destroy_named(vp); 2193 2194#if CONFIG_TRIGGERS 2195 /* 2196 * cleanup trigger info from vnode (if any) 2197 */ 2198 if (vp->v_resolve) 2199 vnode_resolver_detach(vp); 2200#endif 2201 2202 /* 2203 * Reclaim the vnode. 2204 */ 2205 if (VNOP_RECLAIM(vp, ctx)) 2206 panic("vclean: cannot reclaim"); 2207 2208 // make sure the name & parent ptrs get cleaned out! 
2209 vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE); 2210 2211 vnode_lock(vp); 2212 2213 vp->v_mount = dead_mountp; 2214 vp->v_op = dead_vnodeop_p; 2215 vp->v_tag = VT_NON; 2216 vp->v_data = NULL; 2217 2218 vp->v_lflag |= VL_DEAD; 2219 2220 if (already_terminating == 0) { 2221 vp->v_lflag &= ~VL_TERMINATE; 2222 /* 2223 * Done with purge, notify sleepers of the grim news. 2224 */ 2225 if (vp->v_lflag & VL_TERMWANT) { 2226 vp->v_lflag &= ~VL_TERMWANT; 2227 wakeup(&vp->v_lflag); 2228 } 2229 } 2230} 2231 2232/* 2233 * Eliminate all activity associated with the requested vnode 2234 * and with all vnodes aliased to the requested vnode. 2235 */ 2236int 2237#if DIAGNOSTIC 2238vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context) 2239#else 2240vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context) 2241#endif 2242{ 2243 struct vnode *vq; 2244 int vid; 2245 2246#if DIAGNOSTIC 2247 if ((flags & REVOKEALL) == 0) 2248 panic("vnop_revoke"); 2249#endif 2250 2251 if (vnode_isaliased(vp)) { 2252 /* 2253 * If a vgone (or vclean) is already in progress, 2254 * return an immediate error 2255 */ 2256 if (vp->v_lflag & VL_TERMINATE) 2257 return(ENOENT); 2258 2259 /* 2260 * Ensure that vp will not be vgone'd while we 2261 * are eliminating its aliases. 2262 */ 2263 SPECHASH_LOCK(); 2264 while ((vp->v_specflags & SI_ALIASED)) { 2265 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2266 if (vq->v_rdev != vp->v_rdev || 2267 vq->v_type != vp->v_type || vp == vq) 2268 continue; 2269 vid = vq->v_id; 2270 SPECHASH_UNLOCK(); 2271 if (vnode_getwithvid(vq,vid)){ 2272 SPECHASH_LOCK(); 2273 break; 2274 } 2275 vnode_reclaim_internal(vq, 0, 1, 0); 2276 vnode_put(vq); 2277 SPECHASH_LOCK(); 2278 break; 2279 } 2280 } 2281 SPECHASH_UNLOCK(); 2282 } 2283 vnode_reclaim_internal(vp, 0, 0, REVOKEALL); 2284 2285 return (0); 2286} 2287 2288/* 2289 * Recycle an unused vnode to the front of the free list. 
2290 * Release the passed interlock if the vnode will be recycled. 2291 */ 2292int 2293vnode_recycle(struct vnode *vp) 2294{ 2295 vnode_lock_spin(vp); 2296 2297 if (vp->v_iocount || vp->v_usecount) { 2298 vp->v_lflag |= VL_MARKTERM; 2299 vnode_unlock(vp); 2300 return(0); 2301 } 2302 vnode_lock_convert(vp); 2303 vnode_reclaim_internal(vp, 1, 0, 0); 2304 2305 vnode_unlock(vp); 2306 2307 return (1); 2308} 2309 2310static int 2311vnode_reload(vnode_t vp) 2312{ 2313 vnode_lock_spin(vp); 2314 2315 if ((vp->v_iocount > 1) || vp->v_usecount) { 2316 vnode_unlock(vp); 2317 return(0); 2318 } 2319 if (vp->v_iocount <= 0) 2320 panic("vnode_reload with no iocount %d", vp->v_iocount); 2321 2322 /* mark for release when iocount is dopped */ 2323 vp->v_lflag |= VL_MARKTERM; 2324 vnode_unlock(vp); 2325 2326 return (1); 2327} 2328 2329 2330static void 2331vgone(vnode_t vp, int flags) 2332{ 2333 struct vnode *vq; 2334 struct vnode *vx; 2335 2336 /* 2337 * Clean out the filesystem specific data. 2338 * vclean also takes care of removing the 2339 * vnode from any mount list it might be on 2340 */ 2341 vclean(vp, flags | DOCLOSE); 2342 2343 /* 2344 * If special device, remove it from special device alias list 2345 * if it is on one. 
2346 */ 2347 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { 2348 SPECHASH_LOCK(); 2349 if (*vp->v_hashchain == vp) { 2350 *vp->v_hashchain = vp->v_specnext; 2351 } else { 2352 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2353 if (vq->v_specnext != vp) 2354 continue; 2355 vq->v_specnext = vp->v_specnext; 2356 break; 2357 } 2358 if (vq == NULL) 2359 panic("missing bdev"); 2360 } 2361 if (vp->v_specflags & SI_ALIASED) { 2362 vx = NULL; 2363 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2364 if (vq->v_rdev != vp->v_rdev || 2365 vq->v_type != vp->v_type) 2366 continue; 2367 if (vx) 2368 break; 2369 vx = vq; 2370 } 2371 if (vx == NULL) 2372 panic("missing alias"); 2373 if (vq == NULL) 2374 vx->v_specflags &= ~SI_ALIASED; 2375 vp->v_specflags &= ~SI_ALIASED; 2376 } 2377 SPECHASH_UNLOCK(); 2378 { 2379 struct specinfo *tmp = vp->v_specinfo; 2380 vp->v_specinfo = NULL; 2381 FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO); 2382 } 2383 } 2384} 2385 2386/* 2387 * Lookup a vnode by device number. 2388 */ 2389int 2390check_mountedon(dev_t dev, enum vtype type, int *errorp) 2391{ 2392 vnode_t vp; 2393 int rc = 0; 2394 int vid; 2395 2396loop: 2397 SPECHASH_LOCK(); 2398 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 2399 if (dev != vp->v_rdev || type != vp->v_type) 2400 continue; 2401 vid = vp->v_id; 2402 SPECHASH_UNLOCK(); 2403 if (vnode_getwithvid(vp,vid)) 2404 goto loop; 2405 vnode_lock_spin(vp); 2406 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { 2407 vnode_unlock(vp); 2408 if ((*errorp = vfs_mountedon(vp)) != 0) 2409 rc = 1; 2410 } else 2411 vnode_unlock(vp); 2412 vnode_put(vp); 2413 return(rc); 2414 } 2415 SPECHASH_UNLOCK(); 2416 return (0); 2417} 2418 2419/* 2420 * Calculate the total number of references to a special device. 
2421 */ 2422int 2423vcount(vnode_t vp) 2424{ 2425 vnode_t vq, vnext; 2426 int count; 2427 int vid; 2428 2429loop: 2430 if (!vnode_isaliased(vp)) 2431 return (vp->v_specinfo->si_opencount); 2432 count = 0; 2433 2434 SPECHASH_LOCK(); 2435 /* 2436 * Grab first vnode and its vid. 2437 */ 2438 vq = *vp->v_hashchain; 2439 vid = vq ? vq->v_id : 0; 2440 2441 SPECHASH_UNLOCK(); 2442 2443 while (vq) { 2444 /* 2445 * Attempt to get the vnode outside the SPECHASH lock. 2446 */ 2447 if (vnode_getwithvid(vq, vid)) { 2448 goto loop; 2449 } 2450 vnode_lock(vq); 2451 2452 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) { 2453 if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) { 2454 /* 2455 * Alias, but not in use, so flush it out. 2456 */ 2457 vnode_reclaim_internal(vq, 1, 1, 0); 2458 vnode_put_locked(vq); 2459 vnode_unlock(vq); 2460 goto loop; 2461 } 2462 count += vq->v_specinfo->si_opencount; 2463 } 2464 vnode_unlock(vq); 2465 2466 SPECHASH_LOCK(); 2467 /* 2468 * must do this with the reference still held on 'vq' 2469 * so that it can't be destroyed while we're poking 2470 * through v_specnext 2471 */ 2472 vnext = vq->v_specnext; 2473 vid = vnext ? vnext->v_id : 0; 2474 2475 SPECHASH_UNLOCK(); 2476 2477 vnode_put(vq); 2478 2479 vq = vnext; 2480 } 2481 2482 return (count); 2483} 2484 2485int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 2486 2487/* 2488 * Print out a description of a vnode. 
2489 */ 2490static const char *typename[] = 2491 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; 2492 2493void 2494vprint(const char *label, struct vnode *vp) 2495{ 2496 char sbuf[64]; 2497 2498 if (label != NULL) 2499 printf("%s: ", label); 2500 printf("type %s, usecount %d, writecount %d", 2501 typename[vp->v_type], vp->v_usecount, vp->v_writecount); 2502 sbuf[0] = '\0'; 2503 if (vp->v_flag & VROOT) 2504 strlcat(sbuf, "|VROOT", sizeof(sbuf)); 2505 if (vp->v_flag & VTEXT) 2506 strlcat(sbuf, "|VTEXT", sizeof(sbuf)); 2507 if (vp->v_flag & VSYSTEM) 2508 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf)); 2509 if (vp->v_flag & VNOFLUSH) 2510 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf)); 2511 if (vp->v_flag & VBWAIT) 2512 strlcat(sbuf, "|VBWAIT", sizeof(sbuf)); 2513 if (vnode_isaliased(vp)) 2514 strlcat(sbuf, "|VALIASED", sizeof(sbuf)); 2515 if (sbuf[0] != '\0') 2516 printf(" flags (%s)", &sbuf[1]); 2517} 2518 2519 2520int 2521vn_getpath(struct vnode *vp, char *pathbuf, int *len) 2522{ 2523 return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current()); 2524} 2525 2526int 2527vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len) 2528{ 2529 return build_path(vp, pathbuf, *len, len, 0, vfs_context_current()); 2530} 2531 2532int 2533vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash) 2534{ 2535 return ubc_cs_getcdhash(vp, offset, cdhash); 2536} 2537 2538 2539static char *extension_table=NULL; 2540static int nexts; 2541static int max_ext_width; 2542 2543static int 2544extension_cmp(const void *a, const void *b) 2545{ 2546 return (strlen((const char *)a) - strlen((const char *)b)); 2547} 2548 2549 2550// 2551// This is the api LaunchServices uses to inform the kernel 2552// the list of package extensions to ignore. 2553// 2554// Internally we keep the list sorted by the length of the 2555// the extension (from longest to shortest). 
We sort the 2556// list of extensions so that we can speed up our searches 2557// when comparing file names -- we only compare extensions 2558// that could possibly fit into the file name, not all of 2559// them (i.e. a short 8 character name can't have an 8 2560// character extension). 2561// 2562extern lck_mtx_t *pkg_extensions_lck; 2563 2564__private_extern__ int 2565set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) 2566{ 2567 char *new_exts, *old_exts; 2568 int error; 2569 2570 if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { 2571 return EINVAL; 2572 } 2573 2574 2575 // allocate one byte extra so we can guarantee null termination 2576 MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK); 2577 if (new_exts == NULL) { 2578 return ENOMEM; 2579 } 2580 2581 error = copyin(data, new_exts, nentries * maxwidth); 2582 if (error) { 2583 FREE(new_exts, M_TEMP); 2584 return error; 2585 } 2586 2587 new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block 2588 2589 qsort(new_exts, nentries, maxwidth, extension_cmp); 2590 2591 lck_mtx_lock(pkg_extensions_lck); 2592 2593 old_exts = extension_table; 2594 extension_table = new_exts; 2595 nexts = nentries; 2596 max_ext_width = maxwidth; 2597 2598 lck_mtx_unlock(pkg_extensions_lck); 2599 2600 if (old_exts) { 2601 FREE(old_exts, M_TEMP); 2602 } 2603 2604 return 0; 2605} 2606 2607 2608__private_extern__ int 2609is_package_name(const char *name, int len) 2610{ 2611 int i, extlen; 2612 const char *ptr, *name_ext; 2613 2614 if (len <= 3) { 2615 return 0; 2616 } 2617 2618 name_ext = NULL; 2619 for(ptr=name; *ptr != '\0'; ptr++) { 2620 if (*ptr == '.') { 2621 name_ext = ptr; 2622 } 2623 } 2624 2625 // if there is no "." extension, it can't match 2626 if (name_ext == NULL) { 2627 return 0; 2628 } 2629 2630 // advance over the "." 
2631 name_ext++; 2632 2633 lck_mtx_lock(pkg_extensions_lck); 2634 2635 // now iterate over all the extensions to see if any match 2636 ptr = &extension_table[0]; 2637 for(i=0; i < nexts; i++, ptr+=max_ext_width) { 2638 extlen = strlen(ptr); 2639 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') { 2640 // aha, a match! 2641 lck_mtx_unlock(pkg_extensions_lck); 2642 return 1; 2643 } 2644 } 2645 2646 lck_mtx_unlock(pkg_extensions_lck); 2647 2648 // if we get here, no extension matched 2649 return 0; 2650} 2651 2652int 2653vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component) 2654{ 2655 char *ptr, *end; 2656 int comp=0; 2657 2658 *component = -1; 2659 if (*path != '/') { 2660 return EINVAL; 2661 } 2662 2663 end = path + 1; 2664 while(end < path + pathlen && *end != '\0') { 2665 while(end < path + pathlen && *end == '/' && *end != '\0') { 2666 end++; 2667 } 2668 2669 ptr = end; 2670 2671 while(end < path + pathlen && *end != '/' && *end != '\0') { 2672 end++; 2673 } 2674 2675 if (end > path + pathlen) { 2676 // hmm, string wasn't null terminated 2677 return EINVAL; 2678 } 2679 2680 *end = '\0'; 2681 if (is_package_name(ptr, end - ptr)) { 2682 *component = comp; 2683 break; 2684 } 2685 2686 end++; 2687 comp++; 2688 } 2689 2690 return 0; 2691} 2692 2693/* 2694 * Determine if a name is inappropriate for a searchfs query. 2695 * This list consists of /System currently. 2696 */ 2697 2698int vn_searchfs_inappropriate_name(const char *name, int len) { 2699 const char *bad_names[] = { "System" }; 2700 int bad_len[] = { 6 }; 2701 int i; 2702 2703 for(i=0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) { 2704 if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) { 2705 return 1; 2706 } 2707 } 2708 2709 // if we get here, no name matched 2710 return 0; 2711} 2712 2713/* 2714 * Top level filesystem related information gathering. 
2715 */ 2716extern unsigned int vfs_nummntops; 2717 2718int 2719vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, 2720 user_addr_t newp, size_t newlen, proc_t p) 2721{ 2722 struct vfstable *vfsp; 2723 int *username; 2724 u_int usernamelen; 2725 int error; 2726 struct vfsconf vfsc; 2727 2728 if (namelen > CTL_MAXNAME) { 2729 return (EINVAL); 2730 } 2731 2732 /* All non VFS_GENERIC and in VFS_GENERIC, 2733 * VFS_MAXTYPENUM, VFS_CONF, VFS_SET_PACKAGE_EXTS 2734 * needs to have root priv to have modifiers. 2735 * For rest the userland_sysctl(CTLFLAG_ANYBODY) would cover. 2736 */ 2737 if ((newp != USER_ADDR_NULL) && ((name[0] != VFS_GENERIC) || 2738 ((name[1] == VFS_MAXTYPENUM) || 2739 (name[1] == VFS_CONF) || 2740 (name[1] == VFS_SET_PACKAGE_EXTS))) 2741 && (error = suser(kauth_cred_get(), &p->p_acflag))) { 2742 return(error); 2743 } 2744 /* 2745 * The VFS_NUMMNTOPS shouldn't be at name[0] since 2746 * is a VFS generic variable. So now we must check 2747 * namelen so we don't end up covering any UFS 2748 * variables (sinc UFS vfc_typenum is 1). 2749 * 2750 * It should have been: 2751 * name[0]: VFS_GENERIC 2752 * name[1]: VFS_NUMMNTOPS 2753 */ 2754 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) { 2755 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops)); 2756 } 2757 2758 /* all sysctl names at this level are at least name and field */ 2759 if (namelen < 2) 2760 return (EISDIR); /* overloaded */ 2761 if (name[0] != VFS_GENERIC) { 2762 2763 mount_list_lock(); 2764 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2765 if (vfsp->vfc_typenum == name[0]) { 2766 vfsp->vfc_refcount++; 2767 break; 2768 } 2769 mount_list_unlock(); 2770 2771 if (vfsp == NULL) 2772 return (ENOTSUP); 2773 2774 /* XXX current context proxy for proc p? 
*/ 2775 error = ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2776 oldp, oldlenp, newp, newlen, 2777 vfs_context_current())); 2778 2779 mount_list_lock(); 2780 vfsp->vfc_refcount--; 2781 mount_list_unlock(); 2782 return error; 2783 } 2784 switch (name[1]) { 2785 case VFS_MAXTYPENUM: 2786 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf)); 2787 case VFS_CONF: 2788 if (namelen < 3) 2789 return (ENOTDIR); /* overloaded */ 2790 2791 mount_list_lock(); 2792 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2793 if (vfsp->vfc_typenum == name[2]) 2794 break; 2795 2796 if (vfsp == NULL) { 2797 mount_list_unlock(); 2798 return (ENOTSUP); 2799 } 2800 2801 vfsc.vfc_reserved1 = 0; 2802 bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name)); 2803 vfsc.vfc_typenum = vfsp->vfc_typenum; 2804 vfsc.vfc_refcount = vfsp->vfc_refcount; 2805 vfsc.vfc_flags = vfsp->vfc_flags; 2806 vfsc.vfc_reserved2 = 0; 2807 vfsc.vfc_reserved3 = 0; 2808 2809 mount_list_unlock(); 2810 return (sysctl_rdstruct(oldp, oldlenp, newp, &vfsc, 2811 sizeof(struct vfsconf))); 2812 2813 case VFS_SET_PACKAGE_EXTS: 2814 return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]); 2815 } 2816 /* 2817 * We need to get back into the general MIB, so we need to re-prepend 2818 * CTL_VFS to our name and try userland_sysctl(). 
2819 */ 2820 2821 usernamelen = namelen + 1; 2822 MALLOC(username, int *, usernamelen * sizeof(*username), 2823 M_TEMP, M_WAITOK); 2824 bcopy(name, username + 1, namelen * sizeof(*name)); 2825 username[0] = CTL_VFS; 2826 error = userland_sysctl(p, username, usernamelen, oldp, 2827 oldlenp, newp, newlen, oldlenp); 2828 FREE(username, M_TEMP); 2829 return (error); 2830} 2831 2832/* 2833 * Dump vnode list (via sysctl) - defunct 2834 * use "pstat" instead 2835 */ 2836/* ARGSUSED */ 2837int 2838sysctl_vnode 2839(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) 2840{ 2841 return(EINVAL); 2842} 2843 2844SYSCTL_PROC(_kern, KERN_VNODE, vnode, 2845 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 2846 0, 0, sysctl_vnode, "S,", ""); 2847 2848 2849/* 2850 * Check to see if a filesystem is mounted on a block device. 2851 */ 2852int 2853vfs_mountedon(struct vnode *vp) 2854{ 2855 struct vnode *vq; 2856 int error = 0; 2857 2858 SPECHASH_LOCK(); 2859 if (vp->v_specflags & SI_MOUNTEDON) { 2860 error = EBUSY; 2861 goto out; 2862 } 2863 if (vp->v_specflags & SI_ALIASED) { 2864 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2865 if (vq->v_rdev != vp->v_rdev || 2866 vq->v_type != vp->v_type) 2867 continue; 2868 if (vq->v_specflags & SI_MOUNTEDON) { 2869 error = EBUSY; 2870 break; 2871 } 2872 } 2873 } 2874out: 2875 SPECHASH_UNLOCK(); 2876 return (error); 2877} 2878 2879/* 2880 * Unmount all filesystems. The list is traversed in reverse order 2881 * of mounting to avoid dependencies. 2882 */ 2883__private_extern__ void 2884vfs_unmountall(void) 2885{ 2886 struct mount *mp; 2887 int error; 2888 2889 /* 2890 * Since this only runs when rebooting, it is not interlocked. 
2891 */ 2892 mount_list_lock(); 2893 while(!TAILQ_EMPTY(&mountlist)) { 2894 mp = TAILQ_LAST(&mountlist, mntlist); 2895 mount_list_unlock(); 2896 error = dounmount(mp, MNT_FORCE, 0, vfs_context_current()); 2897 if ((error != 0) && (error != EBUSY)) { 2898 printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname); 2899 printf("%d)\n", error); 2900 mount_list_lock(); 2901 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2902 continue; 2903 } else if (error == EBUSY) { 2904 /* If EBUSY is returned, the unmount was already in progress */ 2905 printf("unmount of %p failed (", mp); 2906 printf("BUSY)\n"); 2907 } 2908 mount_list_lock(); 2909 } 2910 mount_list_unlock(); 2911} 2912 2913 2914/* 2915 * This routine is called from vnode_pager_deallocate out of the VM 2916 * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named 2917 * on a vnode that has a UBCINFO 2918 */ 2919__private_extern__ void 2920vnode_pager_vrele(vnode_t vp) 2921{ 2922 struct ubc_info *uip; 2923 2924 vnode_lock_spin(vp); 2925 2926 vp->v_lflag &= ~VNAMED_UBC; 2927 2928 uip = vp->v_ubcinfo; 2929 vp->v_ubcinfo = UBC_INFO_NULL; 2930 2931 vnode_unlock(vp); 2932 2933 ubc_info_deallocate(uip); 2934} 2935 2936 2937#include <sys/disk.h> 2938 2939u_int32_t rootunit = (u_int32_t)-1; 2940 2941errno_t 2942vfs_init_io_attributes(vnode_t devvp, mount_t mp) 2943{ 2944 int error; 2945 off_t readblockcnt = 0; 2946 off_t writeblockcnt = 0; 2947 off_t readmaxcnt = 0; 2948 off_t writemaxcnt = 0; 2949 off_t readsegcnt = 0; 2950 off_t writesegcnt = 0; 2951 off_t readsegsize = 0; 2952 off_t writesegsize = 0; 2953 off_t alignment = 0; 2954 off_t ioqueue_depth = 0; 2955 u_int32_t blksize; 2956 u_int64_t temp; 2957 u_int32_t features; 2958 vfs_context_t ctx = vfs_context_current(); 2959 int isssd = 0; 2960 int isvirtual = 0; 2961 2962 2963 VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL); 2964 /* 2965 * as a reasonable approximation, only use the lowest bit of the mask 2966 * to 
generate a disk unit number 2967 */ 2968 mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask); 2969 2970 if (devvp == rootvp) 2971 rootunit = mp->mnt_devbsdunit; 2972 2973 if (mp->mnt_devbsdunit == rootunit) { 2974 /* 2975 * this mount point exists on the same device as the root 2976 * partition, so it comes under the hard throttle control... 2977 * this is true even for the root mount point itself 2978 */ 2979 mp->mnt_kern_flag |= MNTK_ROOTDEV; 2980 } 2981 /* 2982 * force the spec device to re-cache 2983 * the underlying block size in case 2984 * the filesystem overrode the initial value 2985 */ 2986 set_fsblocksize(devvp); 2987 2988 2989 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, 2990 (caddr_t)&blksize, 0, ctx))) 2991 return (error); 2992 2993 mp->mnt_devblocksize = blksize; 2994 2995 /* 2996 * set the maximum possible I/O size 2997 * this may get clipped to a smaller value 2998 * based on which constraints are being advertised 2999 * and if those advertised constraints result in a smaller 3000 * limit for a given I/O 3001 */ 3002 mp->mnt_maxreadcnt = MAX_UPL_SIZE * PAGE_SIZE; 3003 mp->mnt_maxwritecnt = MAX_UPL_SIZE * PAGE_SIZE; 3004 3005 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) { 3006 if (isvirtual) 3007 mp->mnt_kern_flag |= MNTK_VIRTUALDEV; 3008 } 3009 if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) { 3010 if (isssd) 3011 mp->mnt_kern_flag |= MNTK_SSD; 3012 } 3013 if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES, 3014 (caddr_t)&features, 0, ctx))) 3015 return (error); 3016 3017 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, 3018 (caddr_t)&readblockcnt, 0, ctx))) 3019 return (error); 3020 3021 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, 3022 (caddr_t)&writeblockcnt, 0, ctx))) 3023 return (error); 3024 3025 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, 3026 (caddr_t)&readmaxcnt, 0, ctx))) 3027 return (error); 3028 3029 if ((error = VNOP_IOCTL(devvp, 
DKIOCGETMAXBYTECOUNTWRITE, 3030 (caddr_t)&writemaxcnt, 0, ctx))) 3031 return (error); 3032 3033 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, 3034 (caddr_t)&readsegcnt, 0, ctx))) 3035 return (error); 3036 3037 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, 3038 (caddr_t)&writesegcnt, 0, ctx))) 3039 return (error); 3040 3041 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD, 3042 (caddr_t)&readsegsize, 0, ctx))) 3043 return (error); 3044 3045 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, 3046 (caddr_t)&writesegsize, 0, ctx))) 3047 return (error); 3048 3049 if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT, 3050 (caddr_t)&alignment, 0, ctx))) 3051 return (error); 3052 3053 if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE, 3054 (caddr_t)&ioqueue_depth, 0, ctx))) 3055 return (error); 3056 3057 if (readmaxcnt) 3058 mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt; 3059 3060 if (readblockcnt) { 3061 temp = readblockcnt * blksize; 3062 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; 3063 3064 if (temp < mp->mnt_maxreadcnt) 3065 mp->mnt_maxreadcnt = (u_int32_t)temp; 3066 } 3067 3068 if (writemaxcnt) 3069 mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt; 3070 3071 if (writeblockcnt) { 3072 temp = writeblockcnt * blksize; 3073 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; 3074 3075 if (temp < mp->mnt_maxwritecnt) 3076 mp->mnt_maxwritecnt = (u_int32_t)temp; 3077 } 3078 3079 if (readsegcnt) { 3080 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt; 3081 } else { 3082 temp = mp->mnt_maxreadcnt / PAGE_SIZE; 3083 3084 if (temp > UINT16_MAX) 3085 temp = UINT16_MAX; 3086 } 3087 mp->mnt_segreadcnt = (u_int16_t)temp; 3088 3089 if (writesegcnt) { 3090 temp = (writesegcnt > UINT16_MAX) ? 
UINT16_MAX : writesegcnt; 3091 } else { 3092 temp = mp->mnt_maxwritecnt / PAGE_SIZE; 3093 3094 if (temp > UINT16_MAX) 3095 temp = UINT16_MAX; 3096 } 3097 mp->mnt_segwritecnt = (u_int16_t)temp; 3098 3099 if (readsegsize) 3100 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize; 3101 else 3102 temp = mp->mnt_maxreadcnt; 3103 mp->mnt_maxsegreadsize = (u_int32_t)temp; 3104 3105 if (writesegsize) 3106 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize; 3107 else 3108 temp = mp->mnt_maxwritecnt; 3109 mp->mnt_maxsegwritesize = (u_int32_t)temp; 3110 3111 if (alignment) 3112 temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1; 3113 else 3114 temp = 0; 3115 mp->mnt_alignmentmask = temp; 3116 3117 3118 if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) 3119 temp = ioqueue_depth; 3120 else 3121 temp = MNT_DEFAULT_IOQUEUE_DEPTH; 3122 3123 mp->mnt_ioqueue_depth = temp; 3124 mp->mnt_ioscale = (mp->mnt_ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH; 3125 3126 if (mp->mnt_ioscale > 1) 3127 printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale); 3128 3129 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) 3130 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; 3131 3132 if (features & DK_FEATURE_UNMAP) 3133 mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED; 3134 3135 return (error); 3136} 3137 3138static struct klist fs_klist; 3139lck_grp_t *fs_klist_lck_grp; 3140lck_mtx_t *fs_klist_lock; 3141 3142void 3143vfs_event_init(void) 3144{ 3145 3146 klist_init(&fs_klist); 3147 fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL); 3148 fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL); 3149} 3150 3151void 3152vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) 3153{ 3154 if (event == VQ_DEAD || event == VQ_NOTRESP) { 3155 struct mount *mp = vfs_getvfs(fsid); 3156 if (mp) { 3157 mount_lock_spin(mp); 3158 if (data) 3159 mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding 3160 else 3161 
mp->mnt_kern_flag |= MNT_LNOTRESP; // Not responding 3162 mount_unlock(mp); 3163 } 3164 } 3165 3166 lck_mtx_lock(fs_klist_lock); 3167 KNOTE(&fs_klist, event); 3168 lck_mtx_unlock(fs_klist_lock); 3169} 3170 3171/* 3172 * return the number of mounted filesystems. 3173 */ 3174static int 3175sysctl_vfs_getvfscnt(void) 3176{ 3177 return(mount_getvfscnt()); 3178} 3179 3180 3181static int 3182mount_getvfscnt(void) 3183{ 3184 int ret; 3185 3186 mount_list_lock(); 3187 ret = nummounts; 3188 mount_list_unlock(); 3189 return (ret); 3190 3191} 3192 3193 3194 3195static int 3196mount_fillfsids(fsid_t *fsidlst, int count) 3197{ 3198 struct mount *mp; 3199 int actual=0; 3200 3201 actual = 0; 3202 mount_list_lock(); 3203 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3204 if (actual <= count) { 3205 fsidlst[actual] = mp->mnt_vfsstat.f_fsid; 3206 actual++; 3207 } 3208 } 3209 mount_list_unlock(); 3210 return (actual); 3211 3212} 3213 3214/* 3215 * fill in the array of fsid_t's up to a max of 'count', the actual 3216 * number filled in will be set in '*actual'. If there are more fsid_t's 3217 * than room in fsidlst then ENOMEM will be returned and '*actual' will 3218 * have the actual count. 3219 * having *actual filled out even in the error case is depended upon. 3220 */ 3221static int 3222sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual) 3223{ 3224 struct mount *mp; 3225 3226 *actual = 0; 3227 mount_list_lock(); 3228 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3229 (*actual)++; 3230 if (*actual <= count) 3231 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid; 3232 } 3233 mount_list_unlock(); 3234 return (*actual <= count ? 0 : ENOMEM); 3235} 3236 3237static int 3238sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1, 3239 __unused int arg2, struct sysctl_req *req) 3240{ 3241 int actual, error; 3242 size_t space; 3243 fsid_t *fsidlst; 3244 3245 /* This is a readonly node. 
*/ 3246 if (req->newptr != USER_ADDR_NULL) 3247 return (EPERM); 3248 3249 /* they are querying us so just return the space required. */ 3250 if (req->oldptr == USER_ADDR_NULL) { 3251 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t); 3252 return 0; 3253 } 3254again: 3255 /* 3256 * Retrieve an accurate count of the amount of space required to copy 3257 * out all the fsids in the system. 3258 */ 3259 space = req->oldlen; 3260 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t); 3261 3262 /* they didn't give us enough space. */ 3263 if (space < req->oldlen) 3264 return (ENOMEM); 3265 3266 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK); 3267 if (fsidlst == NULL) { 3268 return (ENOMEM); 3269 } 3270 3271 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t), 3272 &actual); 3273 /* 3274 * If we get back ENOMEM, then another mount has been added while we 3275 * slept in malloc above. If this is the case then try again. 3276 */ 3277 if (error == ENOMEM) { 3278 FREE(fsidlst, M_TEMP); 3279 req->oldlen = space; 3280 goto again; 3281 } 3282 if (error == 0) { 3283 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t)); 3284 } 3285 FREE(fsidlst, M_TEMP); 3286 return (error); 3287} 3288 3289/* 3290 * Do a sysctl by fsid. 3291 */ 3292static int 3293sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, 3294 struct sysctl_req *req) 3295{ 3296 union union_vfsidctl vc; 3297 struct mount *mp; 3298 struct vfsstatfs *sp; 3299 int *name, flags, namelen; 3300 int error=0, gotref=0; 3301 vfs_context_t ctx = vfs_context_current(); 3302 proc_t p = req->p; /* XXX req->p != current_proc()? */ 3303 boolean_t is_64_bit; 3304 3305 name = arg1; 3306 namelen = arg2; 3307 is_64_bit = proc_is64bit(p); 3308 3309 error = SYSCTL_IN(req, &vc, is_64_bit? 
sizeof(vc.vc64):sizeof(vc.vc32)); 3310 if (error) 3311 goto out; 3312 if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */ 3313 error = EINVAL; 3314 goto out; 3315 } 3316 mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */ 3317 if (mp == NULL) { 3318 error = ENOENT; 3319 goto out; 3320 } 3321 gotref = 1; 3322 /* reset so that the fs specific code can fetch it. */ 3323 req->newidx = 0; 3324 /* 3325 * Note if this is a VFS_CTL then we pass the actual sysctl req 3326 * in for "oldp" so that the lower layer can DTRT and use the 3327 * SYSCTL_IN/OUT routines. 3328 */ 3329 if (mp->mnt_op->vfs_sysctl != NULL) { 3330 if (is_64_bit) { 3331 if (vfs_64bitready(mp)) { 3332 error = mp->mnt_op->vfs_sysctl(name, namelen, 3333 CAST_USER_ADDR_T(req), 3334 NULL, USER_ADDR_NULL, 0, 3335 ctx); 3336 } 3337 else { 3338 error = ENOTSUP; 3339 } 3340 } 3341 else { 3342 error = mp->mnt_op->vfs_sysctl(name, namelen, 3343 CAST_USER_ADDR_T(req), 3344 NULL, USER_ADDR_NULL, 0, 3345 ctx); 3346 } 3347 if (error != ENOTSUP) { 3348 goto out; 3349 } 3350 } 3351 switch (name[0]) { 3352 case VFS_CTL_UMOUNT: 3353 req->newidx = 0; 3354 if (is_64_bit) { 3355 req->newptr = vc.vc64.vc_ptr; 3356 req->newlen = (size_t)vc.vc64.vc_len; 3357 } 3358 else { 3359 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); 3360 req->newlen = vc.vc32.vc_len; 3361 } 3362 error = SYSCTL_IN(req, &flags, sizeof(flags)); 3363 if (error) 3364 break; 3365 3366 mount_ref(mp, 0); 3367 mount_iterdrop(mp); 3368 gotref = 0; 3369 /* safedounmount consumes a ref */ 3370 error = safedounmount(mp, flags, ctx); 3371 break; 3372 case VFS_CTL_STATFS: 3373 req->newidx = 0; 3374 if (is_64_bit) { 3375 req->newptr = vc.vc64.vc_ptr; 3376 req->newlen = (size_t)vc.vc64.vc_len; 3377 } 3378 else { 3379 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); 3380 req->newlen = vc.vc32.vc_len; 3381 } 3382 error = SYSCTL_IN(req, &flags, sizeof(flags)); 3383 if (error) 3384 break; 3385 sp = &mp->mnt_vfsstat; 3386 if (((flags & 
MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) && 3387 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) 3388 goto out; 3389 if (is_64_bit) { 3390 struct user64_statfs sfs; 3391 bzero(&sfs, sizeof(sfs)); 3392 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 3393 sfs.f_type = mp->mnt_vtable->vfc_typenum; 3394 sfs.f_bsize = (user64_long_t)sp->f_bsize; 3395 sfs.f_iosize = (user64_long_t)sp->f_iosize; 3396 sfs.f_blocks = (user64_long_t)sp->f_blocks; 3397 sfs.f_bfree = (user64_long_t)sp->f_bfree; 3398 sfs.f_bavail = (user64_long_t)sp->f_bavail; 3399 sfs.f_files = (user64_long_t)sp->f_files; 3400 sfs.f_ffree = (user64_long_t)sp->f_ffree; 3401 sfs.f_fsid = sp->f_fsid; 3402 sfs.f_owner = sp->f_owner; 3403 3404 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { 3405 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); 3406 } else { 3407 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); 3408 } 3409 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); 3410 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); 3411 3412 error = SYSCTL_OUT(req, &sfs, sizeof(sfs)); 3413 } 3414 else { 3415 struct user32_statfs sfs; 3416 bzero(&sfs, sizeof(sfs)); 3417 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 3418 sfs.f_type = mp->mnt_vtable->vfc_typenum; 3419 3420 /* 3421 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we 3422 * have to fudge the numbers here in that case. We inflate the blocksize in order 3423 * to reflect the filesystem size as best we can. 3424 */ 3425 if (sp->f_blocks > INT_MAX) { 3426 int shift; 3427 3428 /* 3429 * Work out how far we have to shift the block count down to make it fit. 3430 * Note that it's possible to have to shift so far that the resulting 3431 * blocksize would be unreportably large. At that point, we will clip 3432 * any values that don't fit. 3433 * 3434 * For safety's sake, we also ensure that f_iosize is never reported as 3435 * being smaller than f_bsize. 
3436 */ 3437 for (shift = 0; shift < 32; shift++) { 3438 if ((sp->f_blocks >> shift) <= INT_MAX) 3439 break; 3440 if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) 3441 break; 3442 } 3443#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s))) 3444 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift); 3445 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift); 3446 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift); 3447#undef __SHIFT_OR_CLIP 3448 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift); 3449 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize); 3450 } else { 3451 sfs.f_bsize = (user32_long_t)sp->f_bsize; 3452 sfs.f_iosize = (user32_long_t)sp->f_iosize; 3453 sfs.f_blocks = (user32_long_t)sp->f_blocks; 3454 sfs.f_bfree = (user32_long_t)sp->f_bfree; 3455 sfs.f_bavail = (user32_long_t)sp->f_bavail; 3456 } 3457 sfs.f_files = (user32_long_t)sp->f_files; 3458 sfs.f_ffree = (user32_long_t)sp->f_ffree; 3459 sfs.f_fsid = sp->f_fsid; 3460 sfs.f_owner = sp->f_owner; 3461 3462 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { 3463 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); 3464 } else { 3465 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); 3466 } 3467 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); 3468 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); 3469 3470 error = SYSCTL_OUT(req, &sfs, sizeof(sfs)); 3471 } 3472 break; 3473 default: 3474 error = ENOTSUP; 3475 goto out; 3476 } 3477out: 3478 if(gotref != 0) 3479 mount_iterdrop(mp); 3480 return (error); 3481} 3482 3483static int filt_fsattach(struct knote *kn); 3484static void filt_fsdetach(struct knote *kn); 3485static int filt_fsevent(struct knote *kn, long hint); 3486struct filterops fs_filtops = { 3487 .f_attach = filt_fsattach, 3488 .f_detach = filt_fsdetach, 3489 .f_event = filt_fsevent, 3490}; 3491 3492static int 3493filt_fsattach(struct knote *kn) 3494{ 3495 3496 
lck_mtx_lock(fs_klist_lock); 3497 kn->kn_flags |= EV_CLEAR; 3498 KNOTE_ATTACH(&fs_klist, kn); 3499 lck_mtx_unlock(fs_klist_lock); 3500 return (0); 3501} 3502 3503static void 3504filt_fsdetach(struct knote *kn) 3505{ 3506 lck_mtx_lock(fs_klist_lock); 3507 KNOTE_DETACH(&fs_klist, kn); 3508 lck_mtx_unlock(fs_klist_lock); 3509} 3510 3511static int 3512filt_fsevent(struct knote *kn, long hint) 3513{ 3514 /* 3515 * Backwards compatibility: 3516 * Other filters would do nothing if kn->kn_sfflags == 0 3517 */ 3518 3519 if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) { 3520 kn->kn_fflags |= hint; 3521 } 3522 3523 return (kn->kn_fflags != 0); 3524} 3525 3526static int 3527sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, 3528 __unused void *arg1, __unused int arg2, struct sysctl_req *req) 3529{ 3530 int out, error; 3531 pid_t pid; 3532 proc_t p; 3533 3534 /* We need a pid. */ 3535 if (req->newptr == USER_ADDR_NULL) 3536 return (EINVAL); 3537 3538 error = SYSCTL_IN(req, &pid, sizeof(pid)); 3539 if (error) 3540 return (error); 3541 3542 p = proc_find(pid < 0 ? -pid : pid); 3543 if (p == NULL) 3544 return (ESRCH); 3545 3546 /* 3547 * Fetching the value is ok, but we only fetch if the old 3548 * pointer is given. 3549 */ 3550 if (req->oldptr != USER_ADDR_NULL) { 3551 out = !((p->p_flag & P_NOREMOTEHANG) == 0); 3552 proc_rele(p); 3553 error = SYSCTL_OUT(req, &out, sizeof(out)); 3554 return (error); 3555 } 3556 3557 /* cansignal offers us enough security. */ 3558 if (p != req->p && proc_suser(req->p) != 0) { 3559 proc_rele(p); 3560 return (EPERM); 3561 } 3562 3563 if (pid < 0) 3564 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag); 3565 else 3566 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag); 3567 proc_rele(p); 3568 3569 return (0); 3570} 3571 3572/* the vfs.generic. branch. 
*/ 3573SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge"); 3574/* retreive a list of mounted filesystem fsid_t */ 3575SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD | CTLFLAG_LOCKED, 3576 NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids"); 3577/* perform operations on filesystem via fsid_t */ 3578SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED, 3579 sysctl_vfs_ctlbyfsid, "ctlbyfsid"); 3580SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY, 3581 NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang"); 3582 3583 3584long num_reusedvnodes = 0; 3585 3586 3587static vnode_t 3588process_vp(vnode_t vp, int want_vp, int *deferred) 3589{ 3590 unsigned int vpid; 3591 3592 *deferred = 0; 3593 3594 vpid = vp->v_id; 3595 3596 vnode_list_remove_locked(vp); 3597 3598 vnode_list_unlock(); 3599 3600 vnode_lock_spin(vp); 3601 3602 /* 3603 * We could wait for the vnode_lock after removing the vp from the freelist 3604 * and the vid is bumped only at the very end of reclaim. So it is possible 3605 * that we are looking at a vnode that is being terminated. If so skip it. 3606 */ 3607 if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || 3608 VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) { 3609 /* 3610 * we lost the race between dropping the list lock 3611 * and picking up the vnode_lock... someone else 3612 * used this vnode and it is now in a new state 3613 */ 3614 vnode_unlock(vp); 3615 3616 return (NULLVP); 3617 } 3618 if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) { 3619 /* 3620 * we did a vnode_rele_ext that asked for 3621 * us not to reenter the filesystem during 3622 * the release even though VL_NEEDINACTIVE was 3623 * set... we'll do it here by doing a 3624 * vnode_get/vnode_put 3625 * 3626 * pick up an iocount so that we can call 3627 * vnode_put and drive the VNOP_INACTIVE... 
3628 * vnode_put will either leave us off 3629 * the freelist if a new ref comes in, 3630 * or put us back on the end of the freelist 3631 * or recycle us if we were marked for termination... 3632 * so we'll just go grab a new candidate 3633 */ 3634 vp->v_iocount++; 3635#ifdef JOE_DEBUG 3636 record_vp(vp, 1); 3637#endif 3638 vnode_put_locked(vp); 3639 vnode_unlock(vp); 3640 3641 return (NULLVP); 3642 } 3643 /* 3644 * Checks for anyone racing us for recycle 3645 */ 3646 if (vp->v_type != VBAD) { 3647 if (want_vp && vnode_on_reliable_media(vp) == FALSE) { 3648 vnode_async_list_add(vp); 3649 vnode_unlock(vp); 3650 3651 *deferred = 1; 3652 3653 return (NULLVP); 3654 } 3655 if (vp->v_lflag & VL_DEAD) 3656 panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp); 3657 3658 vnode_lock_convert(vp); 3659 (void)vnode_reclaim_internal(vp, 1, want_vp, 0); 3660 3661 if (want_vp) { 3662 if ((VONLIST(vp))) 3663 panic("new_vnode(%p): vp on list", vp); 3664 if (vp->v_usecount || vp->v_iocount || vp->v_kusecount || 3665 (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH))) 3666 panic("new_vnode(%p): free vnode still referenced", vp); 3667 if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0)) 3668 panic("new_vnode(%p): vnode seems to be on mount list", vp); 3669 if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren)) 3670 panic("new_vnode(%p): vnode still hooked into the name cache", vp); 3671 } else { 3672 vnode_unlock(vp); 3673 vp = NULLVP; 3674 } 3675 } 3676 return (vp); 3677} 3678 3679 3680 3681static void 3682async_work_continue(void) 3683{ 3684 struct async_work_lst *q; 3685 int deferred; 3686 vnode_t vp; 3687 3688 q = &vnode_async_work_list; 3689 3690 for (;;) { 3691 3692 vnode_list_lock(); 3693 3694 if ( TAILQ_EMPTY(q) ) { 3695 assert_wait(q, (THREAD_UNINT)); 3696 3697 vnode_list_unlock(); 3698 3699 thread_block((thread_continue_t)async_work_continue); 3700 3701 continue; 3702 } 3703 async_work_handled++; 3704 3705 vp = 
TAILQ_FIRST(q); 3706 3707 vp = process_vp(vp, 0, &deferred); 3708 3709 if (vp != NULLVP) 3710 panic("found VBAD vp (%p) on async queue", vp); 3711 } 3712} 3713 3714 3715static int 3716new_vnode(vnode_t *vpp) 3717{ 3718 vnode_t vp; 3719 uint32_t retries = 0, max_retries = 100; /* retry incase of tablefull */ 3720 int force_alloc = 0, walk_count = 0; 3721 boolean_t need_reliable_vp = FALSE; 3722 int deferred; 3723 struct timeval initial_tv; 3724 struct timeval current_tv; 3725 proc_t curproc = current_proc(); 3726 3727 initial_tv.tv_sec = 0; 3728retry: 3729 vp = NULLVP; 3730 3731 vnode_list_lock(); 3732 3733 if (need_reliable_vp == TRUE) 3734 async_work_timed_out++; 3735 3736 if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) { 3737 struct timespec ts; 3738 3739 if ( !TAILQ_EMPTY(&vnode_dead_list)) { 3740 /* 3741 * Can always reuse a dead one 3742 */ 3743 vp = TAILQ_FIRST(&vnode_dead_list); 3744 goto steal_this_vp; 3745 } 3746 /* 3747 * no dead vnodes available... if we're under 3748 * the limit, we'll create a new vnode 3749 */ 3750 numvnodes++; 3751 vnode_list_unlock(); 3752 3753 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK); 3754 bzero((char *)vp, sizeof(*vp)); 3755 VLISTNONE(vp); /* avoid double queue removal */ 3756 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr); 3757 3758 klist_init(&vp->v_knotes); 3759 nanouptime(&ts); 3760 vp->v_id = ts.tv_nsec; 3761 vp->v_flag = VSTANDARD; 3762 3763#if CONFIG_MACF 3764 if (mac_vnode_label_init_needed(vp)) 3765 mac_vnode_label_init(vp); 3766#endif /* MAC */ 3767 3768 vp->v_iocount = 1; 3769 goto done; 3770 } 3771 microuptime(¤t_tv); 3772 3773#define MAX_WALK_COUNT 1000 3774 3775 if ( !TAILQ_EMPTY(&vnode_rage_list) && 3776 (ragevnodes >= rage_limit || 3777 (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) { 3778 3779 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) { 3780 if ( !(vp->v_listflag & VLIST_RAGE)) 3781 panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); 
3782 3783 // if we're a dependency-capable process, skip vnodes that can 3784 // cause recycling deadlocks. (i.e. this process is diskimages 3785 // helper and the vnode is in a disk image). Querying the 3786 // mnt_kern_flag for the mount's virtual device status 3787 // is safer than checking the mnt_dependent_process, which 3788 // may not be updated if there are multiple devnode layers 3789 // in between the disk image and the final consumer. 3790 3791 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || 3792 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { 3793 /* 3794 * if need_reliable_vp == TRUE, then we've already sent one or more 3795 * non-reliable vnodes to the async thread for processing and timed 3796 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT 3797 * mechanism to first scan for a reliable vnode before forcing 3798 * a new vnode to be created 3799 */ 3800 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) 3801 break; 3802 } 3803 3804 // don't iterate more than MAX_WALK_COUNT vnodes to 3805 // avoid keeping the vnode list lock held for too long. 3806 3807 if (walk_count++ > MAX_WALK_COUNT) { 3808 vp = NULL; 3809 break; 3810 } 3811 } 3812 } 3813 3814 if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) { 3815 /* 3816 * Pick the first vp for possible reuse 3817 */ 3818 walk_count = 0; 3819 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { 3820 3821 // if we're a dependency-capable process, skip vnodes that can 3822 // cause recycling deadlocks. (i.e. this process is diskimages 3823 // helper and the vnode is in a disk image). Querying the 3824 // mnt_kern_flag for the mount's virtual device status 3825 // is safer than checking the mnt_dependent_process, which 3826 // may not be updated if there are multiple devnode layers 3827 // in between the disk image and the final consumer. 
3828 3829 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || 3830 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { 3831 /* 3832 * if need_reliable_vp == TRUE, then we've already sent one or more 3833 * non-reliable vnodes to the async thread for processing and timed 3834 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT 3835 * mechanism to first scan for a reliable vnode before forcing 3836 * a new vnode to be created 3837 */ 3838 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) 3839 break; 3840 } 3841 3842 // don't iterate more than MAX_WALK_COUNT vnodes to 3843 // avoid keeping the vnode list lock held for too long. 3844 3845 if (walk_count++ > MAX_WALK_COUNT) { 3846 vp = NULL; 3847 break; 3848 } 3849 } 3850 } 3851 3852 // 3853 // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT 3854 // then we're trying to create a vnode on behalf of a 3855 // process like diskimages-helper that has file systems 3856 // mounted on top of itself (and thus we can't reclaim 3857 // vnodes in the file systems on top of us). if we can't 3858 // find a vnode to reclaim then we'll just have to force 3859 // the allocation. 3860 // 3861 if (vp == NULL && walk_count >= MAX_WALK_COUNT) { 3862 force_alloc = 1; 3863 vnode_list_unlock(); 3864 goto retry; 3865 } 3866 3867 if (vp == NULL) { 3868 /* 3869 * we've reached the system imposed maximum number of vnodes 3870 * but there isn't a single one available 3871 * wait a bit and then retry... 
if we can't get a vnode 3872 * after our target number of retries, than log a complaint 3873 */ 3874 if (++retries <= max_retries) { 3875 vnode_list_unlock(); 3876 delay_for_interval(1, 1000 * 1000); 3877 goto retry; 3878 } 3879 3880 vnode_list_unlock(); 3881 tablefull("vnode"); 3882 log(LOG_EMERG, "%d desired, %d numvnodes, " 3883 "%d free, %d dead, %d rage\n", 3884 desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes); 3885#if CONFIG_JETSAM 3886 /* 3887 * Running out of vnodes tends to make a system unusable. Start killing 3888 * processes that jetsam knows are killable. 3889 */ 3890 if (memorystatus_kill_on_vnode_limit() == FALSE) { 3891 /* 3892 * If jetsam can't find any more processes to kill and there 3893 * still aren't any free vnodes, panic. Hopefully we'll get a 3894 * panic log to tell us why we ran out. 3895 */ 3896 panic("vnode table is full\n"); 3897 } 3898 3899 /* 3900 * Now that we've killed someone, wait a bit and continue looking 3901 * (with fewer retries before trying another kill). 3902 */ 3903 delay_for_interval(3, 1000 * 1000); 3904 retries = 0; 3905 max_retries = 10; 3906 goto retry; 3907#endif 3908 3909 *vpp = NULL; 3910 return (ENFILE); 3911 } 3912steal_this_vp: 3913 if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) { 3914 if (deferred) { 3915 int elapsed_msecs; 3916 struct timeval elapsed_tv; 3917 3918 if (initial_tv.tv_sec == 0) 3919 microuptime(&initial_tv); 3920 3921 vnode_list_lock(); 3922 3923 dead_vnode_waited++; 3924 dead_vnode_wanted++; 3925 3926 /* 3927 * note that we're only going to explicitly wait 10ms 3928 * for a dead vnode to become available, since even if one 3929 * isn't available, a reliable vnode might now be available 3930 * at the head of the VRAGE or free lists... 
if so, we 3931 * can satisfy the new_vnode request with less latency then waiting 3932 * for the full 100ms duration we're ultimately willing to tolerate 3933 */ 3934 assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC); 3935 3936 vnode_list_unlock(); 3937 3938 thread_block(THREAD_CONTINUE_NULL); 3939 3940 microuptime(&elapsed_tv); 3941 3942 timevalsub(&elapsed_tv, &initial_tv); 3943 elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000; 3944 3945 if (elapsed_msecs >= 100) { 3946 /* 3947 * we've waited long enough... 100ms is 3948 * somewhat arbitrary for this case, but the 3949 * normal worst case latency used for UI 3950 * interaction is 100ms, so I've chosen to 3951 * go with that. 3952 * 3953 * setting need_reliable_vp to TRUE 3954 * forces us to find a reliable vnode 3955 * that we can process synchronously, or 3956 * to create a new one if the scan for 3957 * a reliable one hits the scan limit 3958 */ 3959 need_reliable_vp = TRUE; 3960 } 3961 } 3962 goto retry; 3963 } 3964 OSAddAtomicLong(1, &num_reusedvnodes); 3965 3966 3967#if CONFIG_MACF 3968 /* 3969 * We should never see VL_LABELWAIT or VL_LABEL here. 3970 * as those operations hold a reference. 
3971 */ 3972 assert ((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT); 3973 assert ((vp->v_lflag & VL_LABEL) != VL_LABEL); 3974 if (vp->v_lflag & VL_LABELED) { 3975 vnode_lock_convert(vp); 3976 mac_vnode_label_recycle(vp); 3977 } else if (mac_vnode_label_init_needed(vp)) { 3978 vnode_lock_convert(vp); 3979 mac_vnode_label_init(vp); 3980 } 3981 3982#endif /* MAC */ 3983 3984 vp->v_iocount = 1; 3985 vp->v_lflag = 0; 3986 vp->v_writecount = 0; 3987 vp->v_references = 0; 3988 vp->v_iterblkflags = 0; 3989 vp->v_flag = VSTANDARD; 3990 /* vbad vnodes can point to dead_mountp */ 3991 vp->v_mount = NULL; 3992 vp->v_defer_reclaimlist = (vnode_t)0; 3993 3994 vnode_unlock(vp); 3995 3996done: 3997 *vpp = vp; 3998 3999 return (0); 4000} 4001 4002void 4003vnode_lock(vnode_t vp) 4004{ 4005 lck_mtx_lock(&vp->v_lock); 4006} 4007 4008void 4009vnode_lock_spin(vnode_t vp) 4010{ 4011 lck_mtx_lock_spin(&vp->v_lock); 4012} 4013 4014void 4015vnode_unlock(vnode_t vp) 4016{ 4017 lck_mtx_unlock(&vp->v_lock); 4018} 4019 4020 4021 4022int 4023vnode_get(struct vnode *vp) 4024{ 4025 int retval; 4026 4027 vnode_lock_spin(vp); 4028 retval = vnode_get_locked(vp); 4029 vnode_unlock(vp); 4030 4031 return(retval); 4032} 4033 4034int 4035vnode_get_locked(struct vnode *vp) 4036{ 4037#if DIAGNOSTIC 4038 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 4039#endif 4040 if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) { 4041 return(ENOENT); 4042 } 4043 vp->v_iocount++; 4044#ifdef JOE_DEBUG 4045 record_vp(vp, 1); 4046#endif 4047 return (0); 4048} 4049 4050/* 4051 * vnode_getwithvid() cuts in line in front of a vnode drain (that is, 4052 * while the vnode is draining, but at no point after that) to prevent 4053 * deadlocks when getting vnodes from filesystem hashes while holding 4054 * resources that may prevent other iocounts from being released. 
4055 */ 4056int 4057vnode_getwithvid(vnode_t vp, uint32_t vid) 4058{ 4059 return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO ))); 4060} 4061 4062/* 4063 * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode 4064 * drain; it exists for use in the VFS name cache, where we really do want to block behind 4065 * vnode drain to prevent holding off an unmount. 4066 */ 4067int 4068vnode_getwithvid_drainok(vnode_t vp, uint32_t vid) 4069{ 4070 return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID ))); 4071} 4072 4073int 4074vnode_getwithref(vnode_t vp) 4075{ 4076 return(vget_internal(vp, 0, 0)); 4077} 4078 4079 4080__private_extern__ int 4081vnode_getalways(vnode_t vp) 4082{ 4083 return(vget_internal(vp, 0, VNODE_ALWAYS)); 4084} 4085 4086int 4087vnode_put(vnode_t vp) 4088{ 4089 int retval; 4090 4091 vnode_lock_spin(vp); 4092 retval = vnode_put_locked(vp); 4093 vnode_unlock(vp); 4094 4095 return(retval); 4096} 4097 4098int 4099vnode_put_locked(vnode_t vp) 4100{ 4101 vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */ 4102 4103#if DIAGNOSTIC 4104 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 4105#endif 4106retry: 4107 if (vp->v_iocount < 1) 4108 panic("vnode_put(%p): iocount < 1", vp); 4109 4110 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { 4111 vnode_dropiocount(vp); 4112 return(0); 4113 } 4114 if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) { 4115 4116 vp->v_lflag &= ~VL_NEEDINACTIVE; 4117 vnode_unlock(vp); 4118 4119 VNOP_INACTIVE(vp, ctx); 4120 4121 vnode_lock_spin(vp); 4122 /* 4123 * because we had to drop the vnode lock before calling 4124 * VNOP_INACTIVE, the state of this vnode may have changed... 4125 * we may pick up both VL_MARTERM and either 4126 * an iocount or a usecount while in the VNOP_INACTIVE call 4127 * we don't want to call vnode_reclaim_internal on a vnode 4128 * that has active references on it... 
so loop back around 4129 * and reevaluate the state 4130 */ 4131 goto retry; 4132 } 4133 vp->v_lflag &= ~VL_NEEDINACTIVE; 4134 4135 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) { 4136 vnode_lock_convert(vp); 4137 vnode_reclaim_internal(vp, 1, 1, 0); 4138 } 4139 vnode_dropiocount(vp); 4140 vnode_list_add(vp); 4141 4142 return(0); 4143} 4144 4145/* is vnode_t in use by others? */ 4146int 4147vnode_isinuse(vnode_t vp, int refcnt) 4148{ 4149 return(vnode_isinuse_locked(vp, refcnt, 0)); 4150} 4151 4152 4153static int 4154vnode_isinuse_locked(vnode_t vp, int refcnt, int locked) 4155{ 4156 int retval = 0; 4157 4158 if (!locked) 4159 vnode_lock_spin(vp); 4160 if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) { 4161 retval = 1; 4162 goto out; 4163 } 4164 if (vp->v_type == VREG) { 4165 retval = ubc_isinuse_locked(vp, refcnt, 1); 4166 } 4167 4168out: 4169 if (!locked) 4170 vnode_unlock(vp); 4171 return(retval); 4172} 4173 4174 4175/* resume vnode_t */ 4176errno_t 4177vnode_resume(vnode_t vp) 4178{ 4179 if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) { 4180 4181 vnode_lock_spin(vp); 4182 vp->v_lflag &= ~VL_SUSPENDED; 4183 vp->v_owner = NULL; 4184 vnode_unlock(vp); 4185 4186 wakeup(&vp->v_iocount); 4187 } 4188 return(0); 4189} 4190 4191/* suspend vnode_t 4192 * Please do not use on more than one vnode at a time as it may 4193 * cause deadlocks. 4194 * xxx should we explicity prevent this from happening? 4195 */ 4196 4197errno_t 4198vnode_suspend(vnode_t vp) 4199{ 4200 if (vp->v_lflag & VL_SUSPENDED) { 4201 return(EBUSY); 4202 } 4203 4204 vnode_lock_spin(vp); 4205 4206 /* 4207 * xxx is this sufficient to check if a vnode_drain is 4208 * progress? 4209 */ 4210 4211 if (vp->v_owner == NULL) { 4212 vp->v_lflag |= VL_SUSPENDED; 4213 vp->v_owner = current_thread(); 4214 } 4215 vnode_unlock(vp); 4216 4217 return(0); 4218} 4219 4220/* 4221 * Release any blocked locking requests on the vnode. 
4222 * Used for forced-unmounts. 4223 * 4224 * XXX What about network filesystems? 4225 */ 4226static void 4227vnode_abort_advlocks(vnode_t vp) 4228{ 4229 if (vp->v_flag & VLOCKLOCAL) 4230 lf_abort_advlocks(vp); 4231} 4232 4233 4234static errno_t 4235vnode_drain(vnode_t vp) 4236{ 4237 4238 if (vp->v_lflag & VL_DRAIN) { 4239 panic("vnode_drain: recursive drain"); 4240 return(ENOENT); 4241 } 4242 vp->v_lflag |= VL_DRAIN; 4243 vp->v_owner = current_thread(); 4244 4245 while (vp->v_iocount > 1) 4246 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL); 4247 4248 vp->v_lflag &= ~VL_DRAIN; 4249 4250 return(0); 4251} 4252 4253 4254/* 4255 * if the number of recent references via vnode_getwithvid or vnode_getwithref 4256 * exceeds this threshold, than 'UN-AGE' the vnode by removing it from 4257 * the LRU list if it's currently on it... once the iocount and usecount both drop 4258 * to 0, it will get put back on the end of the list, effectively making it younger 4259 * this allows us to keep actively referenced vnodes in the list without having 4260 * to constantly remove and add to the list each time a vnode w/o a usecount is 4261 * referenced which costs us taking and dropping a global lock twice. 
4262 */ 4263#define UNAGE_THRESHHOLD 25 4264 4265errno_t 4266vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) 4267{ 4268 int nodead = vflags & VNODE_NODEAD; 4269 int nosusp = vflags & VNODE_NOSUSPEND; 4270 int always = vflags & VNODE_ALWAYS; 4271 int beatdrain = vflags & VNODE_DRAINO; 4272 int withvid = vflags & VNODE_WITHID; 4273 4274 for (;;) { 4275 /* 4276 * if it is a dead vnode with deadfs 4277 */ 4278 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) { 4279 return(ENOENT); 4280 } 4281 /* 4282 * will return VL_DEAD ones 4283 */ 4284 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) { 4285 break; 4286 } 4287 /* 4288 * if suspended vnodes are to be failed 4289 */ 4290 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) { 4291 return(ENOENT); 4292 } 4293 /* 4294 * if you are the owner of drain/suspend/termination , can acquire iocount 4295 * check for VL_TERMINATE; it does not set owner 4296 */ 4297 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) && 4298 (vp->v_owner == current_thread())) { 4299 break; 4300 } 4301 4302 if (always != 0) 4303 break; 4304 4305 /* 4306 * If this vnode is getting drained, there are some cases where 4307 * we can't block. 4308 */ 4309 if (vp->v_lflag & VL_DRAIN) { 4310 /* 4311 * In some situations, we want to get an iocount 4312 * even if the vnode is draining to prevent deadlock, 4313 * e.g. if we're in the filesystem, potentially holding 4314 * resources that could prevent other iocounts from 4315 * being released. 4316 */ 4317 if (beatdrain) 4318 break; 4319 /* 4320 * Don't block if the vnode's mount point is unmounting as 4321 * we may be the thread the unmount is itself waiting on 4322 * Only callers who pass in vids (at this point, we've already 4323 * handled nosusp and nodead) are expecting error returns 4324 * from this function, so only we can only return errors for 4325 * those. 
ENODEV is intended to inform callers that the call 4326 * failed because an unmount is in progress. 4327 */ 4328 if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount)) 4329 return(ENODEV); 4330 } 4331 4332 vnode_lock_convert(vp); 4333 4334 if (vp->v_lflag & VL_TERMINATE) { 4335 vp->v_lflag |= VL_TERMWANT; 4336 4337 msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", NULL); 4338 } else 4339 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL); 4340 } 4341 if (withvid && vid != vp->v_id) { 4342 return(ENOENT); 4343 } 4344 if (++vp->v_references >= UNAGE_THRESHHOLD) { 4345 vp->v_references = 0; 4346 vnode_list_remove(vp); 4347 } 4348 vp->v_iocount++; 4349#ifdef JOE_DEBUG 4350 record_vp(vp, 1); 4351#endif 4352 return(0); 4353} 4354 4355static void 4356vnode_dropiocount (vnode_t vp) 4357{ 4358 if (vp->v_iocount < 1) 4359 panic("vnode_dropiocount(%p): v_iocount < 1", vp); 4360 4361 vp->v_iocount--; 4362#ifdef JOE_DEBUG 4363 record_vp(vp, -1); 4364#endif 4365 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) 4366 wakeup(&vp->v_iocount); 4367} 4368 4369 4370void 4371vnode_reclaim(struct vnode * vp) 4372{ 4373 vnode_reclaim_internal(vp, 0, 0, 0); 4374} 4375 4376__private_extern__ 4377void 4378vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) 4379{ 4380 int isfifo = 0; 4381 4382 if (!locked) 4383 vnode_lock(vp); 4384 4385 if (vp->v_lflag & VL_TERMINATE) { 4386 panic("vnode reclaim in progress"); 4387 } 4388 vp->v_lflag |= VL_TERMINATE; 4389 4390 vn_clearunionwait(vp, 1); 4391 4392 vnode_drain(vp); 4393 4394 isfifo = (vp->v_type == VFIFO); 4395 4396 if (vp->v_type != VBAD) 4397 vgone(vp, flags); /* clean and reclaim the vnode */ 4398 4399 /* 4400 * give the vnode a new identity so that vnode_getwithvid will fail 4401 * on any stale cache accesses... 4402 * grab the list_lock so that if we're in "new_vnode" 4403 * behind the list_lock trying to steal this vnode, the v_id is stable... 
4404 * once new_vnode drops the list_lock, it will block trying to take 4405 * the vnode lock until we release it... at that point it will evaluate 4406 * whether the v_vid has changed 4407 * also need to make sure that the vnode isn't on a list where "new_vnode" 4408 * can find it after the v_id has been bumped until we are completely done 4409 * with the vnode (i.e. putting it back on a list has to be the very last 4410 * thing we do to this vnode... many of the callers of vnode_reclaim_internal 4411 * are holding an io_count on the vnode... they need to drop the io_count 4412 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until 4413 * they are completely done with the vnode 4414 */ 4415 vnode_list_lock(); 4416 4417 vnode_list_remove_locked(vp); 4418 vp->v_id++; 4419 4420 vnode_list_unlock(); 4421 4422 if (isfifo) { 4423 struct fifoinfo * fip; 4424 4425 fip = vp->v_fifoinfo; 4426 vp->v_fifoinfo = NULL; 4427 FREE(fip, M_TEMP); 4428 } 4429 vp->v_type = VBAD; 4430 4431 if (vp->v_data) 4432 panic("vnode_reclaim_internal: cleaned vnode isn't"); 4433 if (vp->v_numoutput) 4434 panic("vnode_reclaim_internal: clean vnode has pending I/O's"); 4435 if (UBCINFOEXISTS(vp)) 4436 panic("vnode_reclaim_internal: ubcinfo not cleaned"); 4437 if (vp->v_parent) 4438 panic("vnode_reclaim_internal: vparent not removed"); 4439 if (vp->v_name) 4440 panic("vnode_reclaim_internal: vname not removed"); 4441 4442 vp->v_socket = NULL; 4443 4444 vp->v_lflag &= ~VL_TERMINATE; 4445 vp->v_owner = NULL; 4446 4447 KNOTE(&vp->v_knotes, NOTE_REVOKE); 4448 4449 /* Make sure that when we reuse the vnode, no knotes left over */ 4450 klist_init(&vp->v_knotes); 4451 4452 if (vp->v_lflag & VL_TERMWANT) { 4453 vp->v_lflag &= ~VL_TERMWANT; 4454 wakeup(&vp->v_lflag); 4455 } 4456 if (!reuse) { 4457 /* 4458 * make sure we get on the 4459 * dead list if appropriate 4460 */ 4461 vnode_list_add(vp); 4462 } 4463 if (!locked) 4464 vnode_unlock(vp); 4465} 4466 4467/* USAGE: 4468 * The following 
api creates a vnode and associates all the parameter specified in vnode_fsparam 4469 * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias 4470 * is obsoleted by this. 4471 */ 4472int 4473vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) 4474{ 4475 int error; 4476 int insert = 1; 4477 vnode_t vp; 4478 vnode_t nvp; 4479 vnode_t dvp; 4480 struct uthread *ut; 4481 struct componentname *cnp; 4482 struct vnode_fsparam *param = (struct vnode_fsparam *)data; 4483#if CONFIG_TRIGGERS 4484 struct vnode_trigger_param *tinfo = NULL; 4485#endif 4486 if (param == NULL) 4487 return (EINVAL); 4488 4489 /* Do quick sanity check on the parameters. */ 4490 if (param->vnfs_vtype == VBAD) { 4491 return EINVAL; 4492 } 4493 4494#if CONFIG_TRIGGERS 4495 if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) { 4496 tinfo = (struct vnode_trigger_param *)data; 4497 4498 /* Validate trigger vnode input */ 4499 if ((param->vnfs_vtype != VDIR) || 4500 (tinfo->vnt_resolve_func == NULL) || 4501 (tinfo->vnt_flags & ~VNT_VALID_MASK)) { 4502 return (EINVAL); 4503 } 4504 /* Fall through a normal create (params will be the same) */ 4505 flavor = VNCREATE_FLAVOR; 4506 size = VCREATESIZE; 4507 } 4508#endif 4509 if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) 4510 return (EINVAL); 4511 4512 if ( (error = new_vnode(&vp)) ) 4513 return(error); 4514 4515 dvp = param->vnfs_dvp; 4516 cnp = param->vnfs_cnp; 4517 4518 vp->v_op = param->vnfs_vops; 4519 vp->v_type = param->vnfs_vtype; 4520 vp->v_data = param->vnfs_fsnode; 4521 4522 if (param->vnfs_markroot) 4523 vp->v_flag |= VROOT; 4524 if (param->vnfs_marksystem) 4525 vp->v_flag |= VSYSTEM; 4526 if (vp->v_type == VREG) { 4527 error = ubc_info_init_withsize(vp, param->vnfs_filesize); 4528 if (error) { 4529#ifdef JOE_DEBUG 4530 record_vp(vp, 1); 4531#endif 4532 vp->v_mount = NULL; 4533 vp->v_op = dead_vnodeop_p; 4534 vp->v_tag = VT_NON; 4535 vp->v_data = NULL; 4536 
vp->v_type = VBAD; 4537 vp->v_lflag |= VL_DEAD; 4538 4539 vnode_put(vp); 4540 return(error); 4541 } 4542 } 4543#ifdef JOE_DEBUG 4544 record_vp(vp, 1); 4545#endif 4546 4547#if CONFIG_TRIGGERS 4548 /* 4549 * For trigger vnodes, attach trigger info to vnode 4550 */ 4551 if ((vp->v_type == VDIR) && (tinfo != NULL)) { 4552 /* 4553 * Note: has a side effect of incrementing trigger count on the 4554 * mount if successful, which we would need to undo on a 4555 * subsequent failure. 4556 */ 4557#ifdef JOE_DEBUG 4558 record_vp(vp, -1); 4559#endif 4560 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE); 4561 if (error) { 4562 printf("vnode_create: vnode_resolver_create() err %d\n", error); 4563 vp->v_mount = NULL; 4564 vp->v_op = dead_vnodeop_p; 4565 vp->v_tag = VT_NON; 4566 vp->v_data = NULL; 4567 vp->v_type = VBAD; 4568 vp->v_lflag |= VL_DEAD; 4569#ifdef JOE_DEBUG 4570 record_vp(vp, 1); 4571#endif 4572 vnode_put(vp); 4573 return (error); 4574 } 4575 } 4576#endif 4577 if (vp->v_type == VCHR || vp->v_type == VBLK) { 4578 4579 vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ 4580 4581 if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { 4582 /* 4583 * if checkalias returns a vnode, it will be locked 4584 * 4585 * first get rid of the unneeded vnode we acquired 4586 */ 4587 vp->v_data = NULL; 4588 vp->v_op = spec_vnodeop_p; 4589 vp->v_type = VBAD; 4590 vp->v_lflag = VL_DEAD; 4591 vp->v_data = NULL; 4592 vp->v_tag = VT_NON; 4593 vnode_put(vp); 4594 4595 /* 4596 * switch to aliased vnode and finish 4597 * preparing it 4598 */ 4599 vp = nvp; 4600 4601 vclean(vp, 0); 4602 vp->v_op = param->vnfs_vops; 4603 vp->v_type = param->vnfs_vtype; 4604 vp->v_data = param->vnfs_fsnode; 4605 vp->v_lflag = 0; 4606 vp->v_mount = NULL; 4607 insmntque(vp, param->vnfs_mp); 4608 insert = 0; 4609 vnode_unlock(vp); 4610 } 4611 4612 if (VCHR == vp->v_type) { 4613 u_int maj = major(vp->v_rdev); 4614 4615 if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY) 4616 vp->v_flag 
|= VISTTY; 4617 } 4618 } 4619 4620 if (vp->v_type == VFIFO) { 4621 struct fifoinfo *fip; 4622 4623 MALLOC(fip, struct fifoinfo *, 4624 sizeof(*fip), M_TEMP, M_WAITOK); 4625 bzero(fip, sizeof(struct fifoinfo )); 4626 vp->v_fifoinfo = fip; 4627 } 4628 /* The file systems must pass the address of the location where 4629 * they store the vnode pointer. When we add the vnode into the mount 4630 * list and name cache they become discoverable. So the file system node 4631 * must have the connection to vnode setup by then 4632 */ 4633 *vpp = vp; 4634 4635 /* Add fs named reference. */ 4636 if (param->vnfs_flags & VNFS_ADDFSREF) { 4637 vp->v_lflag |= VNAMED_FSHASH; 4638 } 4639 if (param->vnfs_mp) { 4640 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) 4641 vp->v_flag |= VLOCKLOCAL; 4642 if (insert) { 4643 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) 4644 panic("insmntque: vp on the free list\n"); 4645 4646 /* 4647 * enter in mount vnode list 4648 */ 4649 insmntque(vp, param->vnfs_mp); 4650 } 4651 } 4652 if (dvp && vnode_ref(dvp) == 0) { 4653 vp->v_parent = dvp; 4654 } 4655 if (cnp) { 4656 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { 4657 /* 4658 * enter into name cache 4659 * we've got the info to enter it into the name cache now 4660 * cache_enter_create will pick up an extra reference on 4661 * the name entered into the string cache 4662 */ 4663 vp->v_name = cache_enter_create(dvp, vp, cnp); 4664 } else 4665 vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); 4666 4667 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) 4668 vp->v_flag |= VISUNION; 4669 } 4670 if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { 4671 /* 4672 * this vnode is being created as cacheable in the name cache 4673 * this allows us to re-enter it in the cache 4674 */ 4675 vp->v_flag |= VNCACHEABLE; 4676 } 4677 ut = get_bsdthread_info(current_thread()); 4678 4679 if ((current_proc()->p_lflag & P_LRAGE_VNODES) || 4680 (ut->uu_flag 
& UT_RAGE_VNODES)) { 4681 /* 4682 * process has indicated that it wants any 4683 * vnodes created on its behalf to be rapidly 4684 * aged to reduce the impact on the cached set 4685 * of vnodes 4686 */ 4687 vp->v_flag |= VRAGE; 4688 } 4689 return (0); 4690} 4691 4692int 4693vnode_addfsref(vnode_t vp) 4694{ 4695 vnode_lock_spin(vp); 4696 if (vp->v_lflag & VNAMED_FSHASH) 4697 panic("add_fsref: vp already has named reference"); 4698 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) 4699 panic("addfsref: vp on the free list\n"); 4700 vp->v_lflag |= VNAMED_FSHASH; 4701 vnode_unlock(vp); 4702 return(0); 4703 4704} 4705int 4706vnode_removefsref(vnode_t vp) 4707{ 4708 vnode_lock_spin(vp); 4709 if ((vp->v_lflag & VNAMED_FSHASH) == 0) 4710 panic("remove_fsref: no named reference"); 4711 vp->v_lflag &= ~VNAMED_FSHASH; 4712 vnode_unlock(vp); 4713 return(0); 4714 4715} 4716 4717 4718int 4719vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) 4720{ 4721 mount_t mp; 4722 int ret = 0; 4723 fsid_t * fsid_list; 4724 int count, actualcount, i; 4725 void * allocmem; 4726 int indx_start, indx_stop, indx_incr; 4727 4728 count = mount_getvfscnt(); 4729 count += 10; 4730 4731 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t)); 4732 allocmem = (void *)fsid_list; 4733 4734 actualcount = mount_fillfsids(fsid_list, count); 4735 4736 /* 4737 * Establish the iteration direction 4738 * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first) 4739 */ 4740 if (flags & VFS_ITERATE_TAIL_FIRST) { 4741 indx_start = actualcount - 1; 4742 indx_stop = -1; 4743 indx_incr = -1; 4744 } else /* Head first by default */ { 4745 indx_start = 0; 4746 indx_stop = actualcount; 4747 indx_incr = 1; 4748 } 4749 4750 for (i=indx_start; i != indx_stop; i += indx_incr) { 4751 4752 /* obtain the mount point with iteration reference */ 4753 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1); 4754 4755 if(mp == (struct mount *)0) 4756 continue; 4757 mount_lock(mp); 4758 if 
(mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { 4759 mount_unlock(mp); 4760 mount_iterdrop(mp); 4761 continue; 4762 4763 } 4764 mount_unlock(mp); 4765 4766 /* iterate over all the vnodes */ 4767 ret = callout(mp, arg); 4768 4769 mount_iterdrop(mp); 4770 4771 switch (ret) { 4772 case VFS_RETURNED: 4773 case VFS_RETURNED_DONE: 4774 if (ret == VFS_RETURNED_DONE) { 4775 ret = 0; 4776 goto out; 4777 } 4778 break; 4779 4780 case VFS_CLAIMED_DONE: 4781 ret = 0; 4782 goto out; 4783 case VFS_CLAIMED: 4784 default: 4785 break; 4786 } 4787 ret = 0; 4788 } 4789 4790out: 4791 kfree(allocmem, (count * sizeof(fsid_t))); 4792 return (ret); 4793} 4794 4795/* 4796 * Update the vfsstatfs structure in the mountpoint. 4797 * MAC: Parameter eventtype added, indicating whether the event that 4798 * triggered this update came from user space, via a system call 4799 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT). 4800 */ 4801int 4802vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype) 4803{ 4804 struct vfs_attr va; 4805 int error; 4806 4807 /* 4808 * Request the attributes we want to propagate into 4809 * the per-mount vfsstat structure. 4810 */ 4811 VFSATTR_INIT(&va); 4812 VFSATTR_WANTED(&va, f_iosize); 4813 VFSATTR_WANTED(&va, f_blocks); 4814 VFSATTR_WANTED(&va, f_bfree); 4815 VFSATTR_WANTED(&va, f_bavail); 4816 VFSATTR_WANTED(&va, f_bused); 4817 VFSATTR_WANTED(&va, f_files); 4818 VFSATTR_WANTED(&va, f_ffree); 4819 VFSATTR_WANTED(&va, f_bsize); 4820 VFSATTR_WANTED(&va, f_fssubtype); 4821#if CONFIG_MACF 4822 if (eventtype == VFS_USER_EVENT) { 4823 error = mac_mount_check_getattr(ctx, mp, &va); 4824 if (error != 0) 4825 return (error); 4826 } 4827#endif 4828 4829 if ((error = vfs_getattr(mp, &va, ctx)) != 0) { 4830 KAUTH_DEBUG("STAT - filesystem returned error %d", error); 4831 return(error); 4832 } 4833 4834 /* 4835 * Unpack into the per-mount structure. 
4836 * 4837 * We only overwrite these fields, which are likely to change: 4838 * f_blocks 4839 * f_bfree 4840 * f_bavail 4841 * f_bused 4842 * f_files 4843 * f_ffree 4844 * 4845 * And these which are not, but which the FS has no other way 4846 * of providing to us: 4847 * f_bsize 4848 * f_iosize 4849 * f_fssubtype 4850 * 4851 */ 4852 if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) { 4853 /* 4822056 - protect against malformed server mount */ 4854 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? va.f_bsize : 512); 4855 } else { 4856 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */ 4857 } 4858 if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) { 4859 mp->mnt_vfsstat.f_iosize = va.f_iosize; 4860 } else { 4861 mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */ 4862 } 4863 if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) 4864 mp->mnt_vfsstat.f_blocks = va.f_blocks; 4865 if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) 4866 mp->mnt_vfsstat.f_bfree = va.f_bfree; 4867 if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) 4868 mp->mnt_vfsstat.f_bavail = va.f_bavail; 4869 if (VFSATTR_IS_SUPPORTED(&va, f_bused)) 4870 mp->mnt_vfsstat.f_bused = va.f_bused; 4871 if (VFSATTR_IS_SUPPORTED(&va, f_files)) 4872 mp->mnt_vfsstat.f_files = va.f_files; 4873 if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) 4874 mp->mnt_vfsstat.f_ffree = va.f_ffree; 4875 4876 /* this is unlikely to change, but has to be queried for */ 4877 if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) 4878 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype; 4879 4880 return(0); 4881} 4882 4883int 4884mount_list_add(mount_t mp) 4885{ 4886 int res; 4887 4888 mount_list_lock(); 4889 if (system_inshutdown != 0) { 4890 res = -1; 4891 } else { 4892 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); 4893 nummounts++; 4894 res = 0; 4895 } 4896 mount_list_unlock(); 4897 4898 return res; 4899} 4900 4901void 4902mount_list_remove(mount_t mp) 4903{ 4904 mount_list_lock(); 4905 TAILQ_REMOVE(&mountlist, mp, mnt_list); 4906 nummounts--; 4907 
mp->mnt_list.tqe_next = NULL; 4908 mp->mnt_list.tqe_prev = NULL; 4909 mount_list_unlock(); 4910} 4911 4912mount_t 4913mount_lookupby_volfsid(int volfs_id, int withref) 4914{ 4915 mount_t cur_mount = (mount_t)0; 4916 mount_t mp; 4917 4918 mount_list_lock(); 4919 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4920 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) && 4921 (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) && 4922 (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) { 4923 cur_mount = mp; 4924 if (withref) { 4925 if (mount_iterref(cur_mount, 1)) { 4926 cur_mount = (mount_t)0; 4927 mount_list_unlock(); 4928 goto out; 4929 } 4930 } 4931 break; 4932 } 4933 } 4934 mount_list_unlock(); 4935 if (withref && (cur_mount != (mount_t)0)) { 4936 mp = cur_mount; 4937 if (vfs_busy(mp, LK_NOWAIT) != 0) { 4938 cur_mount = (mount_t)0; 4939 } 4940 mount_iterdrop(mp); 4941 } 4942out: 4943 return(cur_mount); 4944} 4945 4946mount_t 4947mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref) 4948{ 4949 mount_t retmp = (mount_t)0; 4950 mount_t mp; 4951 4952 if (!locked) 4953 mount_list_lock(); 4954 TAILQ_FOREACH(mp, &mountlist, mnt_list) 4955 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] && 4956 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) { 4957 retmp = mp; 4958 if (withref) { 4959 if (mount_iterref(retmp, 1)) 4960 retmp = (mount_t)0; 4961 } 4962 goto out; 4963 } 4964out: 4965 if (!locked) 4966 mount_list_unlock(); 4967 return (retmp); 4968} 4969 4970errno_t 4971vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) 4972{ 4973 struct nameidata nd; 4974 int error; 4975 u_int32_t ndflags = 0; 4976 4977 if (ctx == NULL) { /* XXX technically an error */ 4978 ctx = vfs_context_current(); 4979 } 4980 4981 if (flags & VNODE_LOOKUP_NOFOLLOW) 4982 ndflags = NOFOLLOW; 4983 else 4984 ndflags = FOLLOW; 4985 4986 if (flags & VNODE_LOOKUP_NOCROSSMOUNT) 4987 ndflags |= NOCROSSMOUNT; 4988 if (flags & VNODE_LOOKUP_DOWHITEOUT) 4989 ndflags |= DOWHITEOUT; 4990 4991 /* XXX AUDITVNPATH1 needed 
? */ 4992 NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, 4993 CAST_USER_ADDR_T(path), ctx); 4994 4995 if ((error = namei(&nd))) 4996 return (error); 4997 *vpp = nd.ni_vp; 4998 nameidone(&nd); 4999 5000 return (0); 5001} 5002 5003errno_t 5004vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx) 5005{ 5006 struct nameidata nd; 5007 int error; 5008 u_int32_t ndflags = 0; 5009 int lflags = flags; 5010 5011 if (ctx == NULL) { /* XXX technically an error */ 5012 ctx = vfs_context_current(); 5013 } 5014 5015 if (fmode & O_NOFOLLOW) 5016 lflags |= VNODE_LOOKUP_NOFOLLOW; 5017 5018 if (lflags & VNODE_LOOKUP_NOFOLLOW) 5019 ndflags = NOFOLLOW; 5020 else 5021 ndflags = FOLLOW; 5022 5023 if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) 5024 ndflags |= NOCROSSMOUNT; 5025 if (lflags & VNODE_LOOKUP_DOWHITEOUT) 5026 ndflags |= DOWHITEOUT; 5027 5028 /* XXX AUDITVNPATH1 needed ? */ 5029 NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, 5030 CAST_USER_ADDR_T(path), ctx); 5031 5032 if ((error = vn_open(&nd, fmode, cmode))) 5033 *vpp = NULL; 5034 else 5035 *vpp = nd.ni_vp; 5036 5037 return (error); 5038} 5039 5040errno_t 5041vnode_close(vnode_t vp, int flags, vfs_context_t ctx) 5042{ 5043 int error; 5044 5045 if (ctx == NULL) { 5046 ctx = vfs_context_current(); 5047 } 5048 5049 error = vn_close(vp, flags, ctx); 5050 vnode_put(vp); 5051 return (error); 5052} 5053 5054errno_t 5055vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx) 5056{ 5057 struct vnode_attr va; 5058 int error; 5059 5060 VATTR_INIT(&va); 5061 VATTR_WANTED(&va, va_modify_time); 5062 error = vnode_getattr(vp, &va, ctx); 5063 if (!error) 5064 *mtime = va.va_modify_time; 5065 return error; 5066} 5067 5068/* 5069 * Returns: 0 Success 5070 * vnode_getattr:??? 
5071 */ 5072errno_t 5073vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx) 5074{ 5075 struct vnode_attr va; 5076 int error; 5077 5078 VATTR_INIT(&va); 5079 VATTR_WANTED(&va, va_data_size); 5080 error = vnode_getattr(vp, &va, ctx); 5081 if (!error) 5082 *sizep = va.va_data_size; 5083 return(error); 5084} 5085 5086errno_t 5087vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) 5088{ 5089 struct vnode_attr va; 5090 5091 VATTR_INIT(&va); 5092 VATTR_SET(&va, va_data_size, size); 5093 va.va_vaflags = ioflag & 0xffff; 5094 return(vnode_setattr(vp, &va, ctx)); 5095} 5096 5097static int 5098vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) 5099{ 5100 /* Only use compound VNOP for compound operation */ 5101 if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) { 5102 *vpp = NULLVP; 5103 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, VNOP_COMPOUND_OPEN_DO_CREATE, fmode, statusp, vap, ctx); 5104 } else { 5105 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx); 5106 } 5107} 5108 5109/* 5110 * Create a filesystem object of arbitrary type with arbitrary attributes in 5111 * the spevied directory with the specified name. 5112 * 5113 * Parameters: dvp Pointer to the vnode of the directory 5114 * in which to create the object. 5115 * vpp Pointer to the area into which to 5116 * return the vnode of the created object. 5117 * cnp Component name pointer from the namei 5118 * data structure, containing the name to 5119 * use for the create object. 5120 * vap Pointer to the vnode_attr structure 5121 * describing the object to be created, 5122 * including the type of object. 5123 * flags VN_* flags controlling ACL inheritance 5124 * and whether or not authorization is to 5125 * be required for the operation. 
5126 * 5127 * Returns: 0 Success 5128 * !0 errno value 5129 * 5130 * Implicit: *vpp Contains the vnode of the object that 5131 * was created, if successful. 5132 * *cnp May be modified by the underlying VFS. 5133 * *vap May be modified by the underlying VFS. 5134 * modified by either ACL inheritance or 5135 * 5136 * 5137 * be modified, even if the operation is 5138 * 5139 * 5140 * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order. 5141 * 5142 * Modification of '*cnp' and '*vap' by the underlying VFS is 5143 * strongly discouraged. 5144 * 5145 * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c 5146 * 5147 * XXX: We should enummerate the possible errno values here, and where 5148 * in the code they originated. 5149 */ 5150errno_t 5151vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) 5152{ 5153 errno_t error, old_error; 5154 vnode_t vp = (vnode_t)0; 5155 boolean_t batched; 5156 struct componentname *cnp; 5157 uint32_t defaulted; 5158 5159 cnp = &ndp->ni_cnd; 5160 error = 0; 5161 batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE; 5162 5163 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr); 5164 5165 if (flags & VN_CREATE_NOINHERIT) 5166 vap->va_vaflags |= VA_NOINHERIT; 5167 if (flags & VN_CREATE_NOAUTH) 5168 vap->va_vaflags |= VA_NOAUTH; 5169 /* 5170 * Handle ACL inheritance, initialize vap. 5171 */ 5172 error = vn_attribute_prepare(dvp, vap, &defaulted, ctx); 5173 if (error) { 5174 return error; 5175 } 5176 5177 if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) { 5178 panic("Open parameters, but not a regular file."); 5179 } 5180 if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) { 5181 panic("Mode for open, but not trying to open..."); 5182 } 5183 5184 /* 5185 * Create the requested node. 
5186 */ 5187 switch(vap->va_type) { 5188 case VREG: 5189 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx); 5190 break; 5191 case VDIR: 5192 error = vn_mkdir(dvp, vpp, ndp, vap, ctx); 5193 break; 5194 case VSOCK: 5195 case VFIFO: 5196 case VBLK: 5197 case VCHR: 5198 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx); 5199 break; 5200 default: 5201 panic("vnode_create: unknown vtype %d", vap->va_type); 5202 } 5203 if (error != 0) { 5204 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error); 5205 goto out; 5206 } 5207 5208 vp = *vpp; 5209 old_error = error; 5210 5211#if CONFIG_MACF 5212 if (!(flags & VN_CREATE_NOLABEL)) { 5213 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx); 5214 if (error) 5215 goto error; 5216 } 5217#endif 5218 5219 /* 5220 * If some of the requested attributes weren't handled by the VNOP, 5221 * use our fallback code. 5222 */ 5223 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) { 5224 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl); 5225 error = vnode_setattr_fallback(*vpp, vap, ctx); 5226 } 5227#if CONFIG_MACF 5228error: 5229#endif 5230 if ((error != 0) && (vp != (vnode_t)0)) { 5231 5232 /* If we've done a compound open, close */ 5233 if (batched && (old_error == 0) && (vap->va_type == VREG)) { 5234 VNOP_CLOSE(vp, fmode, ctx); 5235 } 5236 5237 /* Need to provide notifications if a create succeeded */ 5238 if (!batched) { 5239 *vpp = (vnode_t) 0; 5240 vnode_put(vp); 5241 } 5242 } 5243 5244out: 5245 vn_attribute_cleanup(vap, defaulted); 5246 5247 return(error); 5248} 5249 5250static kauth_scope_t vnode_scope; 5251static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action, 5252 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); 5253static int vnode_authorize_callback_int(__unused kauth_cred_t credential, __unused void *idata, kauth_action_t action, 5254 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); 5255 5256typedef 
struct _vnode_authorize_context { 5257 vnode_t vp; 5258 struct vnode_attr *vap; 5259 vnode_t dvp; 5260 struct vnode_attr *dvap; 5261 vfs_context_t ctx; 5262 int flags; 5263 int flags_valid; 5264#define _VAC_IS_OWNER (1<<0) 5265#define _VAC_IN_GROUP (1<<1) 5266#define _VAC_IS_DIR_OWNER (1<<2) 5267#define _VAC_IN_DIR_GROUP (1<<3) 5268} *vauth_ctx; 5269 5270void 5271vnode_authorize_init(void) 5272{ 5273 vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL); 5274} 5275 5276#define VATTR_PREPARE_DEFAULTED_UID 0x1 5277#define VATTR_PREPARE_DEFAULTED_GID 0x2 5278#define VATTR_PREPARE_DEFAULTED_MODE 0x4 5279 5280int 5281vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx) 5282{ 5283 kauth_acl_t nacl = NULL, oacl = NULL; 5284 int error; 5285 5286 /* 5287 * Handle ACL inheritance. 5288 */ 5289 if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { 5290 /* save the original filesec */ 5291 if (VATTR_IS_ACTIVE(vap, va_acl)) { 5292 oacl = vap->va_acl; 5293 } 5294 5295 vap->va_acl = NULL; 5296 if ((error = kauth_acl_inherit(dvp, 5297 oacl, 5298 &nacl, 5299 vap->va_type == VDIR, 5300 ctx)) != 0) { 5301 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); 5302 return(error); 5303 } 5304 5305 /* 5306 * If the generated ACL is NULL, then we can save ourselves some effort 5307 * by clearing the active bit. 
5308 */ 5309 if (nacl == NULL) { 5310 VATTR_CLEAR_ACTIVE(vap, va_acl); 5311 } else { 5312 vap->va_base_acl = oacl; 5313 VATTR_SET(vap, va_acl, nacl); 5314 } 5315 } 5316 5317 error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx); 5318 if (error) { 5319 vn_attribute_cleanup(vap, *defaulted_fieldsp); 5320 } 5321 5322 return error; 5323} 5324 5325void 5326vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields) 5327{ 5328 /* 5329 * If the caller supplied a filesec in vap, it has been replaced 5330 * now by the post-inheritance copy. We need to put the original back 5331 * and free the inherited product. 5332 */ 5333 kauth_acl_t nacl, oacl; 5334 5335 if (VATTR_IS_ACTIVE(vap, va_acl)) { 5336 nacl = vap->va_acl; 5337 oacl = vap->va_base_acl; 5338 5339 if (oacl) { 5340 VATTR_SET(vap, va_acl, oacl); 5341 vap->va_base_acl = NULL; 5342 } else { 5343 VATTR_CLEAR_ACTIVE(vap, va_acl); 5344 } 5345 5346 if (nacl != NULL) { 5347 kauth_acl_free(nacl); 5348 } 5349 } 5350 5351 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) { 5352 VATTR_CLEAR_ACTIVE(vap, va_mode); 5353 } 5354 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) { 5355 VATTR_CLEAR_ACTIVE(vap, va_gid); 5356 } 5357 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) { 5358 VATTR_CLEAR_ACTIVE(vap, va_uid); 5359 } 5360 5361 return; 5362} 5363 5364int 5365vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) 5366{ 5367#if !CONFIG_MACF 5368#pragma unused(cnp) 5369#endif 5370 int error = 0; 5371 5372 /* 5373 * Normally, unlinking of directories is not supported. 5374 * However, some file systems may have limited support. 
5375 */ 5376 if ((vp->v_type == VDIR) && 5377 !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { 5378 return (EPERM); /* POSIX */ 5379 } 5380 5381 /* authorize the delete operation */ 5382#if CONFIG_MACF 5383 if (!error) 5384 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); 5385#endif /* MAC */ 5386 if (!error) 5387 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); 5388 5389 return error; 5390} 5391 5392int 5393vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved) 5394{ 5395 /* Open of existing case */ 5396 kauth_action_t action; 5397 int error = 0; 5398 if (cnp->cn_ndp == NULL) { 5399 panic("NULL ndp"); 5400 } 5401 if (reserved != NULL) { 5402 panic("reserved not NULL."); 5403 } 5404 5405#if CONFIG_MACF 5406 /* XXX may do duplicate work here, but ignore that for now (idempotent) */ 5407 if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) { 5408 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx); 5409 if (error) 5410 return (error); 5411 } 5412#endif 5413 5414 if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { 5415 return (ENOTDIR); 5416 } 5417 5418 if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { 5419 return (EOPNOTSUPP); /* Operation not supported on socket */ 5420 } 5421 5422 if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { 5423 return (ELOOP); /* O_NOFOLLOW was specified and the target is a symbolic link */ 5424 } 5425 5426 /* disallow write operations on directories */ 5427 if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { 5428 return (EISDIR); 5429 } 5430 5431 if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) { 5432 if (vp->v_type != VDIR) { 5433 return (ENOTDIR); 5434 } 5435 } 5436 5437#if CONFIG_MACF 5438 /* If a file being opened is a shadow file containing 5439 * namedstream data, ignore the macf checks because it 5440 * is a kernel internal file and access should always 5441 * be allowed. 
5442 */ 5443 if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) { 5444 error = mac_vnode_check_open(ctx, vp, fmode); 5445 if (error) { 5446 return (error); 5447 } 5448 } 5449#endif 5450 5451 /* compute action to be authorized */ 5452 action = 0; 5453 if (fmode & FREAD) { 5454 action |= KAUTH_VNODE_READ_DATA; 5455 } 5456 if (fmode & (FWRITE | O_TRUNC)) { 5457 /* 5458 * If we are writing, appending, and not truncating, 5459 * indicate that we are appending so that if the 5460 * UF_APPEND or SF_APPEND bits are set, we do not deny 5461 * the open. 5462 */ 5463 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { 5464 action |= KAUTH_VNODE_APPEND_DATA; 5465 } else { 5466 action |= KAUTH_VNODE_WRITE_DATA; 5467 } 5468 } 5469 error = vnode_authorize(vp, NULL, action, ctx); 5470#if NAMEDSTREAMS 5471 if (error == EACCES) { 5472 /* 5473 * Shadow files may exist on-disk with a different UID/GID 5474 * than that of the current context. Verify that this file 5475 * is really a shadow file. If it was created successfully 5476 * then it should be authorized. 
5477 */ 5478 if (vnode_isshadow(vp) && vnode_isnamedstream (vp)) { 5479 error = vnode_verifynamedstream(vp); 5480 } 5481 } 5482#endif 5483 5484 return error; 5485} 5486 5487int 5488vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) 5489{ 5490#if !CONFIG_MACF 5491#pragma unused(vap) 5492#endif 5493 /* Creation case */ 5494 int error; 5495 5496 if (cnp->cn_ndp == NULL) { 5497 panic("NULL cn_ndp"); 5498 } 5499 if (reserved != NULL) { 5500 panic("reserved not NULL."); 5501 } 5502 5503 /* Only validate path for creation if we didn't do a complete lookup */ 5504 if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) { 5505 error = lookup_validate_creation_path(cnp->cn_ndp); 5506 if (error) 5507 return (error); 5508 } 5509 5510#if CONFIG_MACF 5511 error = mac_vnode_check_create(ctx, dvp, cnp, vap); 5512 if (error) 5513 return (error); 5514#endif /* CONFIG_MACF */ 5515 5516 return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)); 5517} 5518 5519int 5520vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, 5521 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, 5522 vfs_context_t ctx, void *reserved) 5523{ 5524 int error = 0; 5525 int moving = 0; 5526 5527 if (reserved != NULL) { 5528 panic("Passed something other than NULL as reserved field!"); 5529 } 5530 5531 /* 5532 * Avoid renaming "." and "..". 5533 * 5534 * XXX No need to check for this in the FS. We should always have the leaves 5535 * in VFS in this case. 
5536 */ 5537 if (fvp->v_type == VDIR && 5538 ((fdvp == fvp) || 5539 (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || 5540 ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)) ) { 5541 error = EINVAL; 5542 goto out; 5543 } 5544 5545 if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) { 5546 error = lookup_validate_creation_path(tcnp->cn_ndp); 5547 if (error) 5548 goto out; 5549 } 5550 5551 /***** <MACF> *****/ 5552#if CONFIG_MACF 5553 error = mac_vnode_check_rename_from(ctx, fdvp, fvp, fcnp); 5554 if (error) 5555 goto out; 5556#endif 5557 5558#if CONFIG_MACF 5559 error = mac_vnode_check_rename_to(ctx, 5560 tdvp, tvp, fdvp == tdvp, tcnp); 5561 if (error) 5562 goto out; 5563#endif 5564 /***** </MACF> *****/ 5565 5566 /***** <MiscChecks> *****/ 5567 if (tvp != NULL) { 5568 if (fvp->v_type == VDIR && tvp->v_type != VDIR) { 5569 error = ENOTDIR; 5570 goto out; 5571 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { 5572 error = EISDIR; 5573 goto out; 5574 } 5575 } 5576 5577 if (fvp == tdvp) { 5578 error = EINVAL; 5579 goto out; 5580 } 5581 5582 /* 5583 * The following edge case is caught here: 5584 * (to cannot be a descendent of from) 5585 * 5586 * o fdvp 5587 * / 5588 * / 5589 * o fvp 5590 * \ 5591 * \ 5592 * o tdvp 5593 * / 5594 * / 5595 * o tvp 5596 */ 5597 if (tdvp->v_parent == fvp) { 5598 error = EINVAL; 5599 goto out; 5600 } 5601 /***** </MiscChecks> *****/ 5602 5603 /***** <Kauth> *****/ 5604 5605 error = 0; 5606 if ((tvp != NULL) && vnode_isdir(tvp)) { 5607 if (tvp != fdvp) 5608 moving = 1; 5609 } else if (tdvp != fdvp) { 5610 moving = 1; 5611 } 5612 5613 5614 /* 5615 * must have delete rights to remove the old name even in 5616 * the simple case of fdvp == tdvp. 5617 * 5618 * If fvp is a directory, and we are changing it's parent, 5619 * then we also need rights to rewrite its ".." entry as well. 
5620 */ 5621 if (vnode_isdir(fvp)) { 5622 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) 5623 goto out; 5624 } else { 5625 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) 5626 goto out; 5627 } 5628 if (moving) { 5629 /* moving into tdvp or tvp, must have rights to add */ 5630 if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, 5631 NULL, 5632 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, 5633 ctx)) != 0) { 5634 goto out; 5635 } 5636 } else { 5637 /* node staying in same directory, must be allowed to add new name */ 5638 if ((error = vnode_authorize(fdvp, NULL, 5639 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) 5640 goto out; 5641 } 5642 /* overwriting tvp */ 5643 if ((tvp != NULL) && !vnode_isdir(tvp) && 5644 ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { 5645 goto out; 5646 } 5647 5648 /***** </Kauth> *****/ 5649 5650 /* XXX more checks? 
*/ 5651out: 5652 return error; 5653} 5654 5655int 5656vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) 5657{ 5658#if !CONFIG_MACF 5659#pragma unused(vap) 5660#endif 5661 int error; 5662 5663 if (reserved != NULL) { 5664 panic("reserved not NULL in vn_authorize_mkdir()"); 5665 } 5666 5667 /* XXX A hack for now, to make shadow files work */ 5668 if (cnp->cn_ndp == NULL) { 5669 return 0; 5670 } 5671 5672 if (vnode_compound_mkdir_available(dvp)) { 5673 error = lookup_validate_creation_path(cnp->cn_ndp); 5674 if (error) 5675 goto out; 5676 } 5677 5678#if CONFIG_MACF 5679 error = mac_vnode_check_create(ctx, 5680 dvp, cnp, vap); 5681 if (error) 5682 goto out; 5683#endif 5684 5685 /* authorize addition of a directory to the parent */ 5686 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) 5687 goto out; 5688 5689out: 5690 return error; 5691} 5692 5693int 5694vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved) 5695{ 5696#if CONFIG_MACF 5697 int error; 5698#else 5699#pragma unused(cnp) 5700#endif 5701 if (reserved != NULL) { 5702 panic("Non-NULL reserved argument to vn_authorize_rmdir()"); 5703 } 5704 5705 if (vp->v_type != VDIR) { 5706 /* 5707 * rmdir only deals with directories 5708 */ 5709 return ENOTDIR; 5710 } 5711 5712 if (dvp == vp) { 5713 /* 5714 * No rmdir "." please. 5715 */ 5716 return EINVAL; 5717 } 5718 5719#if CONFIG_MACF 5720 error = mac_vnode_check_unlink(ctx, dvp, 5721 vp, cnp); 5722 if (error) 5723 return error; 5724#endif 5725 5726 return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); 5727} 5728 5729/* 5730 * Authorize an operation on a vnode. 5731 * 5732 * This is KPI, but here because it needs vnode_scope. 5733 * 5734 * Returns: 0 Success 5735 * kauth_authorize_action:EPERM ... 
5736 * xlate => EACCES Permission denied 5737 * kauth_authorize_action:0 Success 5738 * kauth_authorize_action: Depends on callback return; this is 5739 * usually only vnode_authorize_callback(), 5740 * but may include other listerners, if any 5741 * exist. 5742 * EROFS 5743 * EACCES 5744 * EPERM 5745 * ??? 5746 */ 5747int 5748vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx) 5749{ 5750 int error, result; 5751 5752 /* 5753 * We can't authorize against a dead vnode; allow all operations through so that 5754 * the correct error can be returned. 5755 */ 5756 if (vp->v_type == VBAD) 5757 return(0); 5758 5759 error = 0; 5760 result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action, 5761 (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error); 5762 if (result == EPERM) /* traditional behaviour */ 5763 result = EACCES; 5764 /* did the lower layers give a better error return? */ 5765 if ((result != 0) && (error != 0)) 5766 return(error); 5767 return(result); 5768} 5769 5770/* 5771 * Test for vnode immutability. 5772 * 5773 * The 'append' flag is set when the authorization request is constrained 5774 * to operations which only request the right to append to a file. 5775 * 5776 * The 'ignore' flag is set when an operation modifying the immutability flags 5777 * is being authorized. We check the system securelevel to determine which 5778 * immutability flags we can ignore. 
5779 */ 5780static int 5781vnode_immutable(struct vnode_attr *vap, int append, int ignore) 5782{ 5783 int mask; 5784 5785 /* start with all bits precluding the operation */ 5786 mask = IMMUTABLE | APPEND; 5787 5788 /* if appending only, remove the append-only bits */ 5789 if (append) 5790 mask &= ~APPEND; 5791 5792 /* ignore only set when authorizing flags changes */ 5793 if (ignore) { 5794 if (securelevel <= 0) { 5795 /* in insecure state, flags do not inhibit changes */ 5796 mask = 0; 5797 } else { 5798 /* in secure state, user flags don't inhibit */ 5799 mask &= ~(UF_IMMUTABLE | UF_APPEND); 5800 } 5801 } 5802 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore); 5803 if ((vap->va_flags & mask) != 0) 5804 return(EPERM); 5805 return(0); 5806} 5807 5808static int 5809vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred) 5810{ 5811 int result; 5812 5813 /* default assumption is not-owner */ 5814 result = 0; 5815 5816 /* 5817 * If the filesystem has given us a UID, we treat this as authoritative. 5818 */ 5819 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) { 5820 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0; 5821 } 5822 /* we could test the owner UUID here if we had a policy for it */ 5823 5824 return(result); 5825} 5826 5827/* 5828 * vauth_node_group 5829 * 5830 * Description: Ask if a cred is a member of the group owning the vnode object 5831 * 5832 * Parameters: vap vnode attribute 5833 * vap->va_gid group owner of vnode object 5834 * cred credential to check 5835 * ismember pointer to where to put the answer 5836 * idontknow Return this if we can't get an answer 5837 * 5838 * Returns: 0 Success 5839 * idontknow Can't get information 5840 * kauth_cred_ismember_gid:? Error from kauth subsystem 5841 * kauth_cred_ismember_gid:? 
Error from kauth subsystem 5842 */ 5843static int 5844vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow) 5845{ 5846 int error; 5847 int result; 5848 5849 error = 0; 5850 result = 0; 5851 5852 /* 5853 * The caller is expected to have asked the filesystem for a group 5854 * at some point prior to calling this function. The answer may 5855 * have been that there is no group ownership supported for the 5856 * vnode object, in which case we return 5857 */ 5858 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) { 5859 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result); 5860 /* 5861 * Credentials which are opted into external group membership 5862 * resolution which are not known to the external resolver 5863 * will result in an ENOENT error. We translate this into 5864 * the appropriate 'idontknow' response for our caller. 5865 * 5866 * XXX We do not make a distinction here between an ENOENT 5867 * XXX arising from a response from the external resolver, 5868 * XXX and an ENOENT which is internally generated. This is 5869 * XXX a deficiency of the published kauth_cred_ismember_gid() 5870 * XXX KPI which can not be overcome without new KPI. For 5871 * XXX all currently known cases, however, this wil result 5872 * XXX in correct behaviour. 5873 */ 5874 if (error == ENOENT) 5875 error = idontknow; 5876 } 5877 /* 5878 * XXX We could test the group UUID here if we had a policy for it, 5879 * XXX but this is problematic from the perspective of synchronizing 5880 * XXX group UUID and POSIX GID ownership of a file and keeping the 5881 * XXX values coherent over time. The problem is that the local 5882 * XXX system will vend transient group UUIDs for unknown POSIX GID 5883 * XXX values, and these are not persistent, whereas storage of values 5884 * XXX is persistent. 
One potential solution to this is a local 5885 * XXX (persistent) replica of remote directory entries and vended 5886 * XXX local ids in a local directory server (think in terms of a 5887 * XXX caching DNS server). 5888 */ 5889 5890 if (!error) 5891 *ismember = result; 5892 return(error); 5893} 5894 5895static int 5896vauth_file_owner(vauth_ctx vcp) 5897{ 5898 int result; 5899 5900 if (vcp->flags_valid & _VAC_IS_OWNER) { 5901 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0; 5902 } else { 5903 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred); 5904 5905 /* cache our result */ 5906 vcp->flags_valid |= _VAC_IS_OWNER; 5907 if (result) { 5908 vcp->flags |= _VAC_IS_OWNER; 5909 } else { 5910 vcp->flags &= ~_VAC_IS_OWNER; 5911 } 5912 } 5913 return(result); 5914} 5915 5916 5917/* 5918 * vauth_file_ingroup 5919 * 5920 * Description: Ask if a user is a member of the group owning the directory 5921 * 5922 * Parameters: vcp The vnode authorization context that 5923 * contains the user and directory info 5924 * vcp->flags_valid Valid flags 5925 * vcp->flags Flags values 5926 * vcp->vap File vnode attributes 5927 * vcp->ctx VFS Context (for user) 5928 * ismember pointer to where to put the answer 5929 * idontknow Return this if we can't get an answer 5930 * 5931 * Returns: 0 Success 5932 * vauth_node_group:? Error from vauth_node_group() 5933 * 5934 * Implicit returns: *ismember 0 The user is not a group member 5935 * 1 The user is a group member 5936 */ 5937static int 5938vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow) 5939{ 5940 int error; 5941 5942 /* Check for a cached answer first, to avoid the check if possible */ 5943 if (vcp->flags_valid & _VAC_IN_GROUP) { 5944 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 
1 : 0; 5945 error = 0; 5946 } else { 5947 /* Otherwise, go look for it */ 5948 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow); 5949 5950 if (!error) { 5951 /* cache our result */ 5952 vcp->flags_valid |= _VAC_IN_GROUP; 5953 if (*ismember) { 5954 vcp->flags |= _VAC_IN_GROUP; 5955 } else { 5956 vcp->flags &= ~_VAC_IN_GROUP; 5957 } 5958 } 5959 5960 } 5961 return(error); 5962} 5963 5964static int 5965vauth_dir_owner(vauth_ctx vcp) 5966{ 5967 int result; 5968 5969 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) { 5970 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0; 5971 } else { 5972 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred); 5973 5974 /* cache our result */ 5975 vcp->flags_valid |= _VAC_IS_DIR_OWNER; 5976 if (result) { 5977 vcp->flags |= _VAC_IS_DIR_OWNER; 5978 } else { 5979 vcp->flags &= ~_VAC_IS_DIR_OWNER; 5980 } 5981 } 5982 return(result); 5983} 5984 5985/* 5986 * vauth_dir_ingroup 5987 * 5988 * Description: Ask if a user is a member of the group owning the directory 5989 * 5990 * Parameters: vcp The vnode authorization context that 5991 * contains the user and directory info 5992 * vcp->flags_valid Valid flags 5993 * vcp->flags Flags values 5994 * vcp->dvap Dir vnode attributes 5995 * vcp->ctx VFS Context (for user) 5996 * ismember pointer to where to put the answer 5997 * idontknow Return this if we can't get an answer 5998 * 5999 * Returns: 0 Success 6000 * vauth_node_group:? Error from vauth_node_group() 6001 * 6002 * Implicit returns: *ismember 0 The user is not a group member 6003 * 1 The user is a group member 6004 */ 6005static int 6006vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow) 6007{ 6008 int error; 6009 6010 /* Check for a cached answer first, to avoid the check if possible */ 6011 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) { 6012 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 
1 : 0; 6013 error = 0; 6014 } else { 6015 /* Otherwise, go look for it */ 6016 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow); 6017 6018 if (!error) { 6019 /* cache our result */ 6020 vcp->flags_valid |= _VAC_IN_DIR_GROUP; 6021 if (*ismember) { 6022 vcp->flags |= _VAC_IN_DIR_GROUP; 6023 } else { 6024 vcp->flags &= ~_VAC_IN_DIR_GROUP; 6025 } 6026 } 6027 } 6028 return(error); 6029} 6030 6031/* 6032 * Test the posix permissions in (vap) to determine whether (credential) 6033 * may perform (action) 6034 */ 6035static int 6036vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) 6037{ 6038 struct vnode_attr *vap; 6039 int needed, error, owner_ok, group_ok, world_ok, ismember; 6040#ifdef KAUTH_DEBUG_ENABLE 6041 const char *where = "uninitialized"; 6042# define _SETWHERE(c) where = c; 6043#else 6044# define _SETWHERE(c) 6045#endif 6046 6047 /* checking file or directory? */ 6048 if (on_dir) { 6049 vap = vcp->dvap; 6050 } else { 6051 vap = vcp->vap; 6052 } 6053 6054 error = 0; 6055 6056 /* 6057 * We want to do as little work here as possible. So first we check 6058 * which sets of permissions grant us the access we need, and avoid checking 6059 * whether specific permissions grant access when more generic ones would. 
6060 */ 6061 6062 /* owner permissions */ 6063 needed = 0; 6064 if (action & VREAD) 6065 needed |= S_IRUSR; 6066 if (action & VWRITE) 6067 needed |= S_IWUSR; 6068 if (action & VEXEC) 6069 needed |= S_IXUSR; 6070 owner_ok = (needed & vap->va_mode) == needed; 6071 6072 /* group permissions */ 6073 needed = 0; 6074 if (action & VREAD) 6075 needed |= S_IRGRP; 6076 if (action & VWRITE) 6077 needed |= S_IWGRP; 6078 if (action & VEXEC) 6079 needed |= S_IXGRP; 6080 group_ok = (needed & vap->va_mode) == needed; 6081 6082 /* world permissions */ 6083 needed = 0; 6084 if (action & VREAD) 6085 needed |= S_IROTH; 6086 if (action & VWRITE) 6087 needed |= S_IWOTH; 6088 if (action & VEXEC) 6089 needed |= S_IXOTH; 6090 world_ok = (needed & vap->va_mode) == needed; 6091 6092 /* If granted/denied by all three, we're done */ 6093 if (owner_ok && group_ok && world_ok) { 6094 _SETWHERE("all"); 6095 goto out; 6096 } 6097 if (!owner_ok && !group_ok && !world_ok) { 6098 _SETWHERE("all"); 6099 error = EACCES; 6100 goto out; 6101 } 6102 6103 /* Check ownership (relatively cheap) */ 6104 if ((on_dir && vauth_dir_owner(vcp)) || 6105 (!on_dir && vauth_file_owner(vcp))) { 6106 _SETWHERE("user"); 6107 if (!owner_ok) 6108 error = EACCES; 6109 goto out; 6110 } 6111 6112 /* Not owner; if group and world both grant it we're done */ 6113 if (group_ok && world_ok) { 6114 _SETWHERE("group/world"); 6115 goto out; 6116 } 6117 if (!group_ok && !world_ok) { 6118 _SETWHERE("group/world"); 6119 error = EACCES; 6120 goto out; 6121 } 6122 6123 /* Check group membership (most expensive) */ 6124 ismember = 0; /* Default to allow, if the target has no group owner */ 6125 6126 /* 6127 * In the case we can't get an answer about the user from the call to 6128 * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on 6129 * the side of caution, rather than simply granting access, or we will 6130 * fail to correctly implement exclusion groups, so we set the third 6131 * parameter on the basis of the state of 
'group_ok'. 6132 */ 6133 if (on_dir) { 6134 error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0)); 6135 } else { 6136 error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0)); 6137 } 6138 if (error) { 6139 if (!group_ok) 6140 ismember = 1; 6141 error = 0; 6142 } 6143 if (ismember) { 6144 _SETWHERE("group"); 6145 if (!group_ok) 6146 error = EACCES; 6147 goto out; 6148 } 6149 6150 /* Not owner, not in group, use world result */ 6151 _SETWHERE("world"); 6152 if (!world_ok) 6153 error = EACCES; 6154 6155 /* FALLTHROUGH */ 6156 6157out: 6158 KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d", 6159 vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where, 6160 (action & VREAD) ? "r" : "-", 6161 (action & VWRITE) ? "w" : "-", 6162 (action & VEXEC) ? "x" : "-", 6163 needed, 6164 (vap->va_mode & S_IRUSR) ? "r" : "-", 6165 (vap->va_mode & S_IWUSR) ? "w" : "-", 6166 (vap->va_mode & S_IXUSR) ? "x" : "-", 6167 (vap->va_mode & S_IRGRP) ? "r" : "-", 6168 (vap->va_mode & S_IWGRP) ? "w" : "-", 6169 (vap->va_mode & S_IXGRP) ? "x" : "-", 6170 (vap->va_mode & S_IROTH) ? "r" : "-", 6171 (vap->va_mode & S_IWOTH) ? "w" : "-", 6172 (vap->va_mode & S_IXOTH) ? "x" : "-", 6173 kauth_cred_getuid(vcp->ctx->vc_ucred), 6174 on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid, 6175 on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid); 6176 return(error); 6177} 6178 6179/* 6180 * Authorize the deletion of the node vp from the directory dvp. 6181 * 6182 * We assume that: 6183 * - Neither the node nor the directory are immutable. 6184 * - The user is not the superuser. 6185 * 6186 * Deletion is not permitted if the directory is sticky and the caller is 6187 * not owner of the node or directory. 6188 * 6189 * If either the node grants DELETE, or the directory grants DELETE_CHILD, 6190 * the node may be deleted. 
If neither denies the permission, and the 6191 * caller has Posix write access to the directory, then the node may be 6192 * deleted. 6193 * 6194 * As an optimization, we cache whether or not delete child is permitted 6195 * on directories without the sticky bit set. 6196 */ 6197int 6198vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child); 6199/*static*/ int 6200vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) 6201{ 6202 struct vnode_attr *vap = vcp->vap; 6203 struct vnode_attr *dvap = vcp->dvap; 6204 kauth_cred_t cred = vcp->ctx->vc_ucred; 6205 struct kauth_acl_eval eval; 6206 int error, delete_denied, delete_child_denied, ismember; 6207 6208 /* check the ACL on the directory */ 6209 delete_child_denied = 0; 6210 if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { 6211 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; 6212 eval.ae_acl = &dvap->va_acl->acl_ace[0]; 6213 eval.ae_count = dvap->va_acl->acl_entrycount; 6214 eval.ae_options = 0; 6215 if (vauth_dir_owner(vcp)) 6216 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 6217 /* 6218 * We use ENOENT as a marker to indicate we could not get 6219 * information in order to delay evaluation until after we 6220 * have the ACL evaluation answer. Previously, we would 6221 * always deny the operation at this point. 6222 */ 6223 if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) 6224 return(error); 6225 if (error == ENOENT) 6226 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; 6227 else if (ismember) 6228 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 6229 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 6230 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 6231 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 6232 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 6233 6234 /* 6235 * If there is no entry, we are going to defer to other 6236 * authorization mechanisms. 
6237 */ 6238 error = kauth_acl_evaluate(cred, &eval); 6239 6240 if (error != 0) { 6241 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 6242 return(error); 6243 } 6244 switch(eval.ae_result) { 6245 case KAUTH_RESULT_DENY: 6246 delete_child_denied = 1; 6247 break; 6248 /* FALLSTHROUGH */ 6249 case KAUTH_RESULT_ALLOW: 6250 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); 6251 return(0); 6252 case KAUTH_RESULT_DEFER: 6253 default: 6254 /* Effectively the same as !delete_child_denied */ 6255 KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); 6256 break; 6257 } 6258 } 6259 6260 /* check the ACL on the node */ 6261 delete_denied = 0; 6262 if (VATTR_IS_NOT(vap, va_acl, NULL)) { 6263 eval.ae_requested = KAUTH_VNODE_DELETE; 6264 eval.ae_acl = &vap->va_acl->acl_ace[0]; 6265 eval.ae_count = vap->va_acl->acl_entrycount; 6266 eval.ae_options = 0; 6267 if (vauth_file_owner(vcp)) 6268 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 6269 /* 6270 * We use ENOENT as a marker to indicate we could not get 6271 * information in order to delay evaluation until after we 6272 * have the ACL evaluation answer. Previously, we would 6273 * always deny the operation at this point. 
6274 */ 6275 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) 6276 return(error); 6277 if (error == ENOENT) 6278 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; 6279 else if (ismember) 6280 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 6281 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 6282 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 6283 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 6284 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 6285 6286 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { 6287 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 6288 return(error); 6289 } 6290 6291 switch(eval.ae_result) { 6292 case KAUTH_RESULT_DENY: 6293 delete_denied = 1; 6294 break; 6295 case KAUTH_RESULT_ALLOW: 6296 KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp); 6297 return(0); 6298 case KAUTH_RESULT_DEFER: 6299 default: 6300 /* Effectively the same as !delete_child_denied */ 6301 KAUTH_DEBUG("%p DEFERRED%s - by file ACL", vcp->vp, delete_denied ? "(DENY)" : ""); 6302 break; 6303 } 6304 } 6305 6306 /* if denied by ACL on directory or node, return denial */ 6307 if (delete_denied || delete_child_denied) { 6308 KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp); 6309 return(EACCES); 6310 } 6311 6312 /* enforce sticky bit behaviour */ 6313 if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { 6314 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", 6315 vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); 6316 return(EACCES); 6317 } 6318 6319 /* check the directory */ 6320 if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { 6321 KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp); 6322 return(error); 6323 } 6324 6325 /* not denied, must be OK */ 6326 return(0); 6327} 6328 6329 6330/* 6331 * Authorize an operation based on the node's attributes. 
6332 */ 6333static int 6334vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny) 6335{ 6336 struct vnode_attr *vap = vcp->vap; 6337 kauth_cred_t cred = vcp->ctx->vc_ucred; 6338 struct kauth_acl_eval eval; 6339 int error, ismember; 6340 mode_t posix_action; 6341 6342 /* 6343 * If we are the file owner, we automatically have some rights. 6344 * 6345 * Do we need to expand this to support group ownership? 6346 */ 6347 if (vauth_file_owner(vcp)) 6348 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY); 6349 6350 /* 6351 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can 6352 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to 6353 * change ownership to themselves, and WRITE_SECURITY is implicitly 6354 * granted to the owner. We need to do this because at this point 6355 * WRITE_SECURITY may not be granted as the caller is not currently 6356 * the owner. 6357 */ 6358 if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) && 6359 (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) 6360 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY; 6361 6362 if (acl_rights == 0) { 6363 KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp); 6364 return(0); 6365 } 6366 6367 /* if we have an ACL, evaluate it */ 6368 if (VATTR_IS_NOT(vap, va_acl, NULL)) { 6369 eval.ae_requested = acl_rights; 6370 eval.ae_acl = &vap->va_acl->acl_ace[0]; 6371 eval.ae_count = vap->va_acl->acl_entrycount; 6372 eval.ae_options = 0; 6373 if (vauth_file_owner(vcp)) 6374 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 6375 /* 6376 * We use ENOENT as a marker to indicate we could not get 6377 * information in order to delay evaluation until after we 6378 * have the ACL evaluation answer. Previously, we would 6379 * always deny the operation at this point. 
6380 */ 6381 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) 6382 return(error); 6383 if (error == ENOENT) 6384 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; 6385 else if (ismember) 6386 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 6387 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 6388 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 6389 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 6390 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 6391 6392 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { 6393 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 6394 return(error); 6395 } 6396 6397 switch(eval.ae_result) { 6398 case KAUTH_RESULT_DENY: 6399 KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp); 6400 return(EACCES); /* deny, deny, counter-allege */ 6401 case KAUTH_RESULT_ALLOW: 6402 KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp); 6403 return(0); 6404 case KAUTH_RESULT_DEFER: 6405 default: 6406 /* Effectively the same as !delete_child_denied */ 6407 KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); 6408 break; 6409 } 6410 6411 *found_deny = eval.ae_found_deny; 6412 6413 /* fall through and evaluate residual rights */ 6414 } else { 6415 /* no ACL, everything is residual */ 6416 eval.ae_residual = acl_rights; 6417 } 6418 6419 /* 6420 * Grant residual rights that have been pre-authorized. 6421 */ 6422 eval.ae_residual &= ~preauth_rights; 6423 6424 /* 6425 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied. 6426 */ 6427 if (vauth_file_owner(vcp)) 6428 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES; 6429 6430 if (eval.ae_residual == 0) { 6431 KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp); 6432 return(0); 6433 } 6434 6435 /* 6436 * Bail if we have residual rights that can't be granted by posix permissions, 6437 * or aren't presumed granted at this point. 
6438 * 6439 * XXX these can be collapsed for performance 6440 */ 6441 if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) { 6442 KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp); 6443 return(EACCES); 6444 } 6445 if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) { 6446 KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp); 6447 return(EACCES); 6448 } 6449 6450#if DIAGNOSTIC 6451 if (eval.ae_residual & KAUTH_VNODE_DELETE) 6452 panic("vnode_authorize: can't be checking delete permission here"); 6453#endif 6454 6455 /* 6456 * Compute the fallback posix permissions that will satisfy the remaining 6457 * rights. 6458 */ 6459 posix_action = 0; 6460 if (eval.ae_residual & (KAUTH_VNODE_READ_DATA | 6461 KAUTH_VNODE_LIST_DIRECTORY | 6462 KAUTH_VNODE_READ_EXTATTRIBUTES)) 6463 posix_action |= VREAD; 6464 if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA | 6465 KAUTH_VNODE_ADD_FILE | 6466 KAUTH_VNODE_ADD_SUBDIRECTORY | 6467 KAUTH_VNODE_DELETE_CHILD | 6468 KAUTH_VNODE_WRITE_ATTRIBUTES | 6469 KAUTH_VNODE_WRITE_EXTATTRIBUTES)) 6470 posix_action |= VWRITE; 6471 if (eval.ae_residual & (KAUTH_VNODE_EXECUTE | 6472 KAUTH_VNODE_SEARCH)) 6473 posix_action |= VEXEC; 6474 6475 if (posix_action != 0) { 6476 return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */)); 6477 } else { 6478 KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping", 6479 vcp->vp, 6480 (eval.ae_residual & KAUTH_VNODE_READ_DATA) 6481 ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "", 6482 (eval.ae_residual & KAUTH_VNODE_WRITE_DATA) 6483 ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "", 6484 (eval.ae_residual & KAUTH_VNODE_EXECUTE) 6485 ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "", 6486 (eval.ae_residual & KAUTH_VNODE_DELETE) 6487 ? " DELETE" : "", 6488 (eval.ae_residual & KAUTH_VNODE_APPEND_DATA) 6489 ? vnode_isdir(vcp->vp) ? 
" ADD_SUBDIRECTORY" : " APPEND_DATA" : "", 6490 (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD) 6491 ? " DELETE_CHILD" : "", 6492 (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES) 6493 ? " READ_ATTRIBUTES" : "", 6494 (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES) 6495 ? " WRITE_ATTRIBUTES" : "", 6496 (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES) 6497 ? " READ_EXTATTRIBUTES" : "", 6498 (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES) 6499 ? " WRITE_EXTATTRIBUTES" : "", 6500 (eval.ae_residual & KAUTH_VNODE_READ_SECURITY) 6501 ? " READ_SECURITY" : "", 6502 (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) 6503 ? " WRITE_SECURITY" : "", 6504 (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE) 6505 ? " CHECKIMMUTABLE" : "", 6506 (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) 6507 ? " CHANGE_OWNER" : ""); 6508 } 6509 6510 /* 6511 * Lack of required Posix permissions implies no reason to deny access. 6512 */ 6513 return(0); 6514} 6515 6516/* 6517 * Check for file immutability. 6518 */ 6519static int 6520vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore) 6521{ 6522 mount_t mp; 6523 int error; 6524 int append; 6525 6526 /* 6527 * Perform immutability checks for operations that change data. 6528 * 6529 * Sockets, fifos and devices require special handling. 6530 */ 6531 switch(vp->v_type) { 6532 case VSOCK: 6533 case VFIFO: 6534 case VBLK: 6535 case VCHR: 6536 /* 6537 * Writing to these nodes does not change the filesystem data, 6538 * so forget that it's being tried. 
6539 */ 6540 rights &= ~KAUTH_VNODE_WRITE_DATA; 6541 break; 6542 default: 6543 break; 6544 } 6545 6546 error = 0; 6547 if (rights & KAUTH_VNODE_WRITE_RIGHTS) { 6548 6549 /* check per-filesystem options if possible */ 6550 mp = vp->v_mount; 6551 if (mp != NULL) { 6552 6553 /* check for no-EA filesystems */ 6554 if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) && 6555 (vfs_flags(mp) & MNT_NOUSERXATTR)) { 6556 KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp); 6557 error = EACCES; /* User attributes disabled */ 6558 goto out; 6559 } 6560 } 6561 6562 /* 6563 * check for file immutability. first, check if the requested rights are 6564 * allowable for a UF_APPEND file. 6565 */ 6566 append = 0; 6567 if (vp->v_type == VDIR) { 6568 if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) 6569 append = 1; 6570 } else { 6571 if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) 6572 append = 1; 6573 } 6574 if ((error = vnode_immutable(vap, append, ignore)) != 0) { 6575 KAUTH_DEBUG("%p DENIED - file is immutable", vp); 6576 goto out; 6577 } 6578 } 6579out: 6580 return(error); 6581} 6582 6583/* 6584 * Handle authorization actions for filesystems that advertise that the 6585 * server will be enforcing. 6586 * 6587 * Returns: 0 Authorization should be handled locally 6588 * 1 Authorization was handled by the FS 6589 * 6590 * Note: Imputed returns will only occur if the authorization request 6591 * was handled by the FS. 6592 * 6593 * Imputed: *resultp, modified Return code from FS when the request is 6594 * handled by the FS. 6595 * VNOP_ACCESS:??? 6596 * VNOP_OPEN:??? 6597 */ 6598static int 6599vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx) 6600{ 6601 int error; 6602 6603 /* 6604 * If the vp is a device node, socket or FIFO it actually represents a local 6605 * endpoint, so we need to handle it locally. 
6606 */ 6607 switch(vp->v_type) { 6608 case VBLK: 6609 case VCHR: 6610 case VSOCK: 6611 case VFIFO: 6612 return(0); 6613 default: 6614 break; 6615 } 6616 6617 /* 6618 * In the advisory request case, if the filesystem doesn't think it's reliable 6619 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data. 6620 */ 6621 if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) 6622 return(0); 6623 6624 /* 6625 * Let the filesystem have a say in the matter. It's OK for it to not implemnent 6626 * VNOP_ACCESS, as most will authorise inline with the actual request. 6627 */ 6628 if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) { 6629 *resultp = error; 6630 KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp); 6631 return(1); 6632 } 6633 6634 /* 6635 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In 6636 * order to be reasonably sure that exec will be permitted, we try a bit harder here. 6637 */ 6638 if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) { 6639 /* try a VNOP_OPEN for readonly access */ 6640 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) { 6641 *resultp = error; 6642 KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp); 6643 return(1); 6644 } 6645 VNOP_CLOSE(vp, FREAD, ctx); 6646 } 6647 6648 /* 6649 * We don't have any reason to believe that the request has to be denied at this point, 6650 * so go ahead and allow it. 6651 */ 6652 *resultp = 0; 6653 KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp); 6654 return(1); 6655} 6656 6657 6658 6659 6660/* 6661 * Returns: KAUTH_RESULT_ALLOW 6662 * KAUTH_RESULT_DENY 6663 * 6664 * Imputed: *arg3, modified Error code in the deny case 6665 * EROFS Read-only file system 6666 * EACCES Permission denied 6667 * EPERM Operation not permitted [no execute] 6668 * vnode_getattr:ENOMEM Not enough space [only if has filesec] 6669 * vnode_getattr:??? 
6670 * vnode_authorize_opaque:*arg2 ??? 6671 * vnode_authorize_checkimmutable:??? 6672 * vnode_authorize_delete:??? 6673 * vnode_authorize_simple:??? 6674 */ 6675 6676 6677static int 6678vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action, 6679 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) 6680{ 6681 vfs_context_t ctx; 6682 vnode_t cvp = NULLVP; 6683 vnode_t vp, dvp; 6684 int result = KAUTH_RESULT_DENY; 6685 int parent_iocount = 0; 6686 int parent_action; /* In case we need to use namedstream's data fork for cached rights*/ 6687 6688 ctx = (vfs_context_t)arg0; 6689 vp = (vnode_t)arg1; 6690 dvp = (vnode_t)arg2; 6691 6692 /* 6693 * if there are 2 vnodes passed in, we don't know at 6694 * this point which rights to look at based on the 6695 * combined action being passed in... defer until later... 6696 * otherwise check the kauth 'rights' cache hung 6697 * off of the vnode we're interested in... if we've already 6698 * been granted the right we're currently interested in, 6699 * we can just return success... otherwise we'll go through 6700 * the process of authorizing the requested right(s)... if that 6701 * succeeds, we'll add the right(s) to the cache. 6702 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache 6703 */ 6704 if (dvp && vp) 6705 goto defer; 6706 if (dvp) { 6707 cvp = dvp; 6708 } else { 6709 /* 6710 * For named streams on local-authorization volumes, rights are cached on the parent; 6711 * authorization is determined by looking at the parent's properties anyway, so storing 6712 * on the parent means that we don't recompute for the named stream and that if 6713 * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the 6714 * stream to flush its cache separately. If we miss in the cache, then we authorize 6715 * as if there were no cached rights (passing the named stream vnode and desired rights to 6716 * vnode_authorize_callback_int()). 
6717 * 6718 * On an opaquely authorized volume, we don't know the relationship between the 6719 * data fork's properties and the rights granted on a stream. Thus, named stream vnodes 6720 * on such a volume are authorized directly (rather than using the parent) and have their 6721 * own caches. When a named stream vnode is created, we mark the parent as having a named 6722 * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we 6723 * find the stream and flush its cache. 6724 */ 6725 if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) { 6726 cvp = vnode_getparent(vp); 6727 if (cvp != NULLVP) { 6728 parent_iocount = 1; 6729 } else { 6730 cvp = NULL; 6731 goto defer; /* If we can't use the parent, take the slow path */ 6732 } 6733 6734 /* Have to translate some actions */ 6735 parent_action = action; 6736 if (parent_action & KAUTH_VNODE_READ_DATA) { 6737 parent_action &= ~KAUTH_VNODE_READ_DATA; 6738 parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES; 6739 } 6740 if (parent_action & KAUTH_VNODE_WRITE_DATA) { 6741 parent_action &= ~KAUTH_VNODE_WRITE_DATA; 6742 parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; 6743 } 6744 6745 } else { 6746 cvp = vp; 6747 } 6748 } 6749 6750 if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? 
parent_action : action) == TRUE) { 6751 result = KAUTH_RESULT_ALLOW; 6752 goto out; 6753 } 6754defer: 6755 result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3); 6756 6757 if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) { 6758 KAUTH_DEBUG("%p - caching action = %x", cvp, action); 6759 vnode_cache_authorized_action(cvp, ctx, action); 6760 } 6761 6762out: 6763 if (parent_iocount) { 6764 vnode_put(cvp); 6765 } 6766 6767 return result; 6768} 6769 6770 6771static int 6772vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action, 6773 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) 6774{ 6775 struct _vnode_authorize_context auth_context; 6776 vauth_ctx vcp; 6777 vfs_context_t ctx; 6778 vnode_t vp, dvp; 6779 kauth_cred_t cred; 6780 kauth_ace_rights_t rights; 6781 struct vnode_attr va, dva; 6782 int result; 6783 int *errorp; 6784 int noimmutable; 6785 boolean_t parent_authorized_for_delete_child = FALSE; 6786 boolean_t found_deny = FALSE; 6787 boolean_t parent_ref= FALSE; 6788 6789 vcp = &auth_context; 6790 ctx = vcp->ctx = (vfs_context_t)arg0; 6791 vp = vcp->vp = (vnode_t)arg1; 6792 dvp = vcp->dvp = (vnode_t)arg2; 6793 errorp = (int *)arg3; 6794 /* 6795 * Note that we authorize against the context, not the passed cred 6796 * (the same thing anyway) 6797 */ 6798 cred = ctx->vc_ucred; 6799 6800 VATTR_INIT(&va); 6801 vcp->vap = &va; 6802 VATTR_INIT(&dva); 6803 vcp->dvap = &dva; 6804 6805 vcp->flags = vcp->flags_valid = 0; 6806 6807#if DIAGNOSTIC 6808 if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) 6809 panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred); 6810#endif 6811 6812 KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)", 6813 vp, vfs_context_proc(ctx)->p_comm, 6814 (action & KAUTH_VNODE_ACCESS) ? "access" : "auth", 6815 (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? 
" LIST_DIRECTORY" : " READ_DATA" : "", 6816 (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "", 6817 (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "", 6818 (action & KAUTH_VNODE_DELETE) ? " DELETE" : "", 6819 (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "", 6820 (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "", 6821 (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "", 6822 (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "", 6823 (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "", 6824 (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "", 6825 (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "", 6826 (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "", 6827 (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "", 6828 (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "", 6829 vnode_isdir(vp) ? "directory" : "file", 6830 vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp); 6831 6832 /* 6833 * Extract the control bits from the action, everything else is 6834 * requested rights. 6835 */ 6836 noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; 6837 rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); 6838 6839 if (rights & KAUTH_VNODE_DELETE) { 6840#if DIAGNOSTIC 6841 if (dvp == NULL) 6842 panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory"); 6843#endif 6844 /* 6845 * check to see if we've already authorized the parent 6846 * directory for deletion of its children... if so, we 6847 * can skip a whole bunch of work... 
we will still have to 6848 * authorize that this specific child can be removed 6849 */ 6850 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) 6851 parent_authorized_for_delete_child = TRUE; 6852 } else { 6853 dvp = NULL; 6854 } 6855 6856 /* 6857 * Check for read-only filesystems. 6858 */ 6859 if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && 6860 (vp->v_mount->mnt_flag & MNT_RDONLY) && 6861 ((vp->v_type == VREG) || (vp->v_type == VDIR) || 6862 (vp->v_type == VLNK) || (vp->v_type == VCPLX) || 6863 (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) { 6864 result = EROFS; 6865 goto out; 6866 } 6867 6868 /* 6869 * Check for noexec filesystems. 6870 */ 6871 if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) { 6872 result = EACCES; 6873 goto out; 6874 } 6875 6876 /* 6877 * Handle cases related to filesystems with non-local enforcement. 6878 * This call can return 0, in which case we will fall through to perform a 6879 * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets 6880 * an appropriate result, at which point we can return immediately. 6881 */ 6882 if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) 6883 goto out; 6884 6885 /* 6886 * Get vnode attributes and extended security information for the vnode 6887 * and directory if required. 
6888 */ 6889 VATTR_WANTED(&va, va_mode); 6890 VATTR_WANTED(&va, va_uid); 6891 VATTR_WANTED(&va, va_gid); 6892 VATTR_WANTED(&va, va_flags); 6893 VATTR_WANTED(&va, va_acl); 6894 if ((result = vnode_getattr(vp, &va, ctx)) != 0) { 6895 KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); 6896 goto out; 6897 } 6898 if (dvp) { 6899 VATTR_WANTED(&dva, va_mode); 6900 VATTR_WANTED(&dva, va_uid); 6901 VATTR_WANTED(&dva, va_gid); 6902 VATTR_WANTED(&dva, va_flags); 6903 VATTR_WANTED(&dva, va_acl); 6904 if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) { 6905 KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result); 6906 goto out; 6907 } 6908 } 6909 6910 /* 6911 * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes 6912 * *_EXTATTRIBUTES. 6913 */ 6914 if (vnode_isnamedstream(vp)) { 6915 if (rights & KAUTH_VNODE_READ_DATA) { 6916 rights &= ~KAUTH_VNODE_READ_DATA; 6917 rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; 6918 } 6919 if (rights & KAUTH_VNODE_WRITE_DATA) { 6920 rights &= ~KAUTH_VNODE_WRITE_DATA; 6921 rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; 6922 } 6923 } 6924 6925 /* 6926 * Point 'vp' to the resource fork's parent for ACL checking 6927 */ 6928 if (vnode_isnamedstream(vp) && 6929 (vp->v_parent != NULL) && 6930 (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) { 6931 parent_ref = TRUE; 6932 vcp->vp = vp = vp->v_parent; 6933 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) 6934 kauth_acl_free(va.va_acl); 6935 VATTR_INIT(&va); 6936 VATTR_WANTED(&va, va_mode); 6937 VATTR_WANTED(&va, va_uid); 6938 VATTR_WANTED(&va, va_gid); 6939 VATTR_WANTED(&va, va_flags); 6940 VATTR_WANTED(&va, va_acl); 6941 if ((result = vnode_getattr(vp, &va, ctx)) != 0) 6942 goto out; 6943 } 6944 6945 /* 6946 * Check for immutability. 6947 * 6948 * In the deletion case, parent directory immutability vetoes specific 6949 * file rights. 
6950 */ 6951 if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0) 6952 goto out; 6953 if ((rights & KAUTH_VNODE_DELETE) && 6954 parent_authorized_for_delete_child == FALSE && 6955 ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0)) 6956 goto out; 6957 6958 /* 6959 * Clear rights that have been authorized by reaching this point, bail if nothing left to 6960 * check. 6961 */ 6962 rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE); 6963 if (rights == 0) 6964 goto out; 6965 6966 /* 6967 * If we're not the superuser, authorize based on file properties; 6968 * note that even if parent_authorized_for_delete_child is TRUE, we 6969 * need to check on the node itself. 6970 */ 6971 if (!vfs_context_issuser(ctx)) { 6972 /* process delete rights */ 6973 if ((rights & KAUTH_VNODE_DELETE) && 6974 ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) 6975 goto out; 6976 6977 /* process remaining rights */ 6978 if ((rights & ~KAUTH_VNODE_DELETE) && 6979 (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, &found_deny)) != 0) 6980 goto out; 6981 } else { 6982 6983 /* 6984 * Execute is only granted to root if one of the x bits is set. This check only 6985 * makes sense if the posix mode bits are actually supported. 
6986 */ 6987 if ((rights & KAUTH_VNODE_EXECUTE) && 6988 (vp->v_type == VREG) && 6989 VATTR_IS_SUPPORTED(&va, va_mode) && 6990 !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { 6991 result = EPERM; 6992 KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode); 6993 goto out; 6994 } 6995 6996 KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp); 6997 } 6998out: 6999 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) 7000 kauth_acl_free(va.va_acl); 7001 if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) 7002 kauth_acl_free(dva.va_acl); 7003 7004 if (result) { 7005 if (parent_ref) 7006 vnode_put(vp); 7007 *errorp = result; 7008 KAUTH_DEBUG("%p DENIED - auth denied", vp); 7009 return(KAUTH_RESULT_DENY); 7010 } 7011 if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) { 7012 /* 7013 * if we were successfully granted the right to search this directory 7014 * and there were NO ACL DENYs for search and the posix permissions also don't 7015 * deny execute, we can synthesize a global right that allows anyone to 7016 * traverse this directory during a pathname lookup without having to 7017 * match the credential associated with this cache of rights. 7018 */ 7019 if (!VATTR_IS_SUPPORTED(&va, va_mode) || 7020 ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 7021 (S_IXUSR | S_IXGRP | S_IXOTH))) { 7022 vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE); 7023 } 7024 } 7025 if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) { 7026 /* 7027 * parent was successfully and newly authorized for content deletions 7028 * add it to the cache, but only if it doesn't have the sticky 7029 * bit set on it. This same check is done earlier guarding 7030 * fetching of dva, and if we jumped to out without having done 7031 * this, we will have returned already because of a non-zero 7032 * 'result' value. 
7033 */ 7034 if (VATTR_IS_SUPPORTED(&dva, va_mode) && 7035 !(dva.va_mode & (S_ISVTX))) { 7036 /* OK to cache delete rights */ 7037 KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp); 7038 vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD); 7039 } 7040 } 7041 if (parent_ref) 7042 vnode_put(vp); 7043 /* 7044 * Note that this implies that we will allow requests for no rights, as well as 7045 * for rights that we do not recognise. There should be none of these. 7046 */ 7047 KAUTH_DEBUG("%p ALLOWED - auth granted", vp); 7048 return(KAUTH_RESULT_ALLOW); 7049} 7050 7051int 7052vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) 7053{ 7054 return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx); 7055} 7056 7057/* 7058 * Check that the attribute information in vattr can be legally applied to 7059 * a new file by the context. 7060 */ 7061static int 7062vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) 7063{ 7064 int error; 7065 int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; 7066 kauth_cred_t cred; 7067 guid_t changer; 7068 mount_t dmp; 7069 7070 error = 0; 7071 7072 if (defaulted_fieldsp) { 7073 *defaulted_fieldsp = 0; 7074 } 7075 7076 defaulted_owner = defaulted_group = defaulted_mode = 0; 7077 7078 /* 7079 * Require that the filesystem support extended security to apply any. 7080 */ 7081 if (!vfs_extendedsecurity(dvp->v_mount) && 7082 (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) { 7083 error = EINVAL; 7084 goto out; 7085 } 7086 7087 /* 7088 * Default some fields. 7089 */ 7090 dmp = dvp->v_mount; 7091 7092 /* 7093 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that 7094 * owner takes ownership of all new files. 
7095 */ 7096 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) { 7097 VATTR_SET(vap, va_uid, dmp->mnt_fsowner); 7098 defaulted_owner = 1; 7099 } else { 7100 if (!VATTR_IS_ACTIVE(vap, va_uid)) { 7101 /* default owner is current user */ 7102 VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx))); 7103 defaulted_owner = 1; 7104 } 7105 } 7106 7107 /* 7108 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that 7109 * group takes ownership of all new files. 7110 */ 7111 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) { 7112 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup); 7113 defaulted_group = 1; 7114 } else { 7115 if (!VATTR_IS_ACTIVE(vap, va_gid)) { 7116 /* default group comes from parent object, fallback to current user */ 7117 struct vnode_attr dva; 7118 VATTR_INIT(&dva); 7119 VATTR_WANTED(&dva, va_gid); 7120 if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) 7121 goto out; 7122 if (VATTR_IS_SUPPORTED(&dva, va_gid)) { 7123 VATTR_SET(vap, va_gid, dva.va_gid); 7124 } else { 7125 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx))); 7126 } 7127 defaulted_group = 1; 7128 } 7129 } 7130 7131 if (!VATTR_IS_ACTIVE(vap, va_flags)) 7132 VATTR_SET(vap, va_flags, 0); 7133 7134 /* default mode is everything, masked with current umask */ 7135 if (!VATTR_IS_ACTIVE(vap, va_mode)) { 7136 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask); 7137 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask); 7138 defaulted_mode = 1; 7139 } 7140 /* set timestamps to now */ 7141 if (!VATTR_IS_ACTIVE(vap, va_create_time)) { 7142 nanotime(&vap->va_create_time); 7143 VATTR_SET_ACTIVE(vap, va_create_time); 7144 } 7145 7146 /* 7147 * Check for attempts to set nonsensical fields. 
7148 */ 7149 if (vap->va_active & ~VNODE_ATTR_NEWOBJ) { 7150 error = EINVAL; 7151 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx", 7152 vap->va_active & ~VNODE_ATTR_NEWOBJ); 7153 goto out; 7154 } 7155 7156 /* 7157 * Quickly check for the applicability of any enforcement here. 7158 * Tests below maintain the integrity of the local security model. 7159 */ 7160 if (vfs_authopaque(dvp->v_mount)) 7161 goto out; 7162 7163 /* 7164 * We need to know if the caller is the superuser, or if the work is 7165 * otherwise already authorised. 7166 */ 7167 cred = vfs_context_ucred(ctx); 7168 if (noauth) { 7169 /* doing work for the kernel */ 7170 has_priv_suser = 1; 7171 } else { 7172 has_priv_suser = vfs_context_issuser(ctx); 7173 } 7174 7175 7176 if (VATTR_IS_ACTIVE(vap, va_flags)) { 7177 if (has_priv_suser) { 7178 if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { 7179 error = EPERM; 7180 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)"); 7181 goto out; 7182 } 7183 } else { 7184 if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) { 7185 error = EPERM; 7186 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)"); 7187 goto out; 7188 } 7189 } 7190 } 7191 7192 /* if not superuser, validate legality of new-item attributes */ 7193 if (!has_priv_suser) { 7194 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) { 7195 /* setgid? */ 7196 if (vap->va_mode & S_ISGID) { 7197 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 7198 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); 7199 goto out; 7200 } 7201 if (!ismember) { 7202 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid); 7203 error = EPERM; 7204 goto out; 7205 } 7206 } 7207 7208 /* setuid? 
*/ 7209 if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) { 7210 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); 7211 error = EPERM; 7212 goto out; 7213 } 7214 } 7215 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) { 7216 KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid); 7217 error = EPERM; 7218 goto out; 7219 } 7220 if (!defaulted_group) { 7221 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 7222 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid); 7223 goto out; 7224 } 7225 if (!ismember) { 7226 KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid); 7227 error = EPERM; 7228 goto out; 7229 } 7230 } 7231 7232 /* initialising owner/group UUID */ 7233 if (VATTR_IS_ACTIVE(vap, va_uuuid)) { 7234 if ((error = kauth_cred_getguid(cred, &changer)) != 0) { 7235 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error); 7236 /* XXX ENOENT here - no GUID - should perhaps become EPERM */ 7237 goto out; 7238 } 7239 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) { 7240 KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us"); 7241 error = EPERM; 7242 goto out; 7243 } 7244 } 7245 if (VATTR_IS_ACTIVE(vap, va_guuid)) { 7246 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) { 7247 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error); 7248 goto out; 7249 } 7250 if (!ismember) { 7251 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member"); 7252 error = EPERM; 7253 goto out; 7254 } 7255 } 7256 } 7257out: 7258 if (defaulted_fieldsp) { 7259 if (defaulted_mode) { 7260 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; 7261 } 7262 if (defaulted_group) { 7263 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID; 7264 } 7265 if (defaulted_owner) { 7266 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID; 7267 } 7268 } 
7269 return(error); 7270} 7271 7272/* 7273 * Check that the attribute information in vap can be legally written by the 7274 * context. 7275 * 7276 * Call this when you're not sure about the vnode_attr; either its contents 7277 * have come from an unknown source, or when they are variable. 7278 * 7279 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that 7280 * must be authorized to be permitted to write the vattr. 7281 */ 7282int 7283vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx) 7284{ 7285 struct vnode_attr ova; 7286 kauth_action_t required_action; 7287 int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid; 7288 guid_t changer; 7289 gid_t group; 7290 uid_t owner; 7291 mode_t newmode; 7292 kauth_cred_t cred; 7293 uint32_t fdelta; 7294 7295 VATTR_INIT(&ova); 7296 required_action = 0; 7297 error = 0; 7298 7299 /* 7300 * Quickly check for enforcement applicability. 7301 */ 7302 if (vfs_authopaque(vp->v_mount)) 7303 goto out; 7304 7305 /* 7306 * Check for attempts to set nonsensical fields. 7307 */ 7308 if (vap->va_active & VNODE_ATTR_RDONLY) { 7309 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)"); 7310 error = EINVAL; 7311 goto out; 7312 } 7313 7314 /* 7315 * We need to know if the caller is the superuser. 
7316 */ 7317 cred = vfs_context_ucred(ctx); 7318 has_priv_suser = kauth_cred_issuser(cred); 7319 7320 /* 7321 * If any of the following are changing, we need information from the old file: 7322 * va_uid 7323 * va_gid 7324 * va_mode 7325 * va_uuuid 7326 * va_guuid 7327 */ 7328 if (VATTR_IS_ACTIVE(vap, va_uid) || 7329 VATTR_IS_ACTIVE(vap, va_gid) || 7330 VATTR_IS_ACTIVE(vap, va_mode) || 7331 VATTR_IS_ACTIVE(vap, va_uuuid) || 7332 VATTR_IS_ACTIVE(vap, va_guuid)) { 7333 VATTR_WANTED(&ova, va_mode); 7334 VATTR_WANTED(&ova, va_uid); 7335 VATTR_WANTED(&ova, va_gid); 7336 VATTR_WANTED(&ova, va_uuuid); 7337 VATTR_WANTED(&ova, va_guuid); 7338 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes"); 7339 } 7340 7341 /* 7342 * If timestamps are being changed, we need to know who the file is owned 7343 * by. 7344 */ 7345 if (VATTR_IS_ACTIVE(vap, va_create_time) || 7346 VATTR_IS_ACTIVE(vap, va_change_time) || 7347 VATTR_IS_ACTIVE(vap, va_modify_time) || 7348 VATTR_IS_ACTIVE(vap, va_access_time) || 7349 VATTR_IS_ACTIVE(vap, va_backup_time)) { 7350 7351 VATTR_WANTED(&ova, va_uid); 7352#if 0 /* enable this when we support UUIDs as official owners */ 7353 VATTR_WANTED(&ova, va_uuuid); 7354#endif 7355 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID"); 7356 } 7357 7358 /* 7359 * If flags are being changed, we need the old flags. 7360 */ 7361 if (VATTR_IS_ACTIVE(vap, va_flags)) { 7362 KAUTH_DEBUG("ATTR - flags changing, fetching old flags"); 7363 VATTR_WANTED(&ova, va_flags); 7364 } 7365 7366 /* 7367 * If ACLs are being changed, we need the old ACLs. 7368 */ 7369 if (VATTR_IS_ACTIVE(vap, va_acl)) { 7370 KAUTH_DEBUG("ATTR - acl changing, fetching old flags"); 7371 VATTR_WANTED(&ova, va_acl); 7372 } 7373 7374 /* 7375 * If the size is being set, make sure it's not a directory. 
7376 */ 7377 if (VATTR_IS_ACTIVE(vap, va_data_size)) { 7378 /* size is meaningless on a directory, don't permit this */ 7379 if (vnode_isdir(vp)) { 7380 KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory"); 7381 error = EISDIR; 7382 goto out; 7383 } 7384 } 7385 7386 /* 7387 * Get old data. 7388 */ 7389 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active); 7390 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) { 7391 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error); 7392 goto out; 7393 } 7394 7395 /* 7396 * Size changes require write access to the file data. 7397 */ 7398 if (VATTR_IS_ACTIVE(vap, va_data_size)) { 7399 /* if we can't get the size, or it's different, we need write access */ 7400 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA"); 7401 required_action |= KAUTH_VNODE_WRITE_DATA; 7402 } 7403 7404 /* 7405 * Changing timestamps? 7406 * 7407 * Note that we are only called to authorize user-requested time changes; 7408 * side-effect time changes are not authorized. Authorisation is only 7409 * required for existing files. 7410 * 7411 * Non-owners are not permitted to change the time on an existing 7412 * file to anything other than the current time. 7413 */ 7414 if (VATTR_IS_ACTIVE(vap, va_create_time) || 7415 VATTR_IS_ACTIVE(vap, va_change_time) || 7416 VATTR_IS_ACTIVE(vap, va_modify_time) || 7417 VATTR_IS_ACTIVE(vap, va_access_time) || 7418 VATTR_IS_ACTIVE(vap, va_backup_time)) { 7419 /* 7420 * The owner and root may set any timestamps they like, 7421 * provided that the file is not immutable. The owner still needs 7422 * WRITE_ATTRIBUTES (implied by ownership but still deniable). 7423 */ 7424 if (has_priv_suser || vauth_node_owner(&ova, cred)) { 7425 KAUTH_DEBUG("ATTR - root or owner changing timestamps"); 7426 required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES; 7427 } else { 7428 /* just setting the current time? 
*/ 7429 if (vap->va_vaflags & VA_UTIMES_NULL) { 7430 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES"); 7431 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; 7432 } else { 7433 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted"); 7434 error = EACCES; 7435 goto out; 7436 } 7437 } 7438 } 7439 7440 /* 7441 * Changing file mode? 7442 */ 7443 if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) { 7444 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode); 7445 7446 /* 7447 * Mode changes always have the same basic auth requirements. 7448 */ 7449 if (has_priv_suser) { 7450 KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check"); 7451 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; 7452 } else { 7453 /* need WRITE_SECURITY */ 7454 KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY"); 7455 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7456 } 7457 7458 /* 7459 * Can't set the setgid bit if you're not in the group and not root. Have to have 7460 * existing group information in the case we're not setting it right now. 7461 */ 7462 if (vap->va_mode & S_ISGID) { 7463 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ 7464 if (!has_priv_suser) { 7465 if (VATTR_IS_ACTIVE(vap, va_gid)) { 7466 group = vap->va_gid; 7467 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) { 7468 group = ova.va_gid; 7469 } else { 7470 KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available"); 7471 error = EINVAL; 7472 goto out; 7473 } 7474 /* 7475 * This might be too restrictive; WRITE_SECURITY might be implied by 7476 * membership in this case, rather than being an additional requirement. 
7477 */ 7478 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) { 7479 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); 7480 goto out; 7481 } 7482 if (!ismember) { 7483 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group); 7484 error = EPERM; 7485 goto out; 7486 } 7487 } 7488 } 7489 7490 /* 7491 * Can't set the setuid bit unless you're root or the file's owner. 7492 */ 7493 if (vap->va_mode & S_ISUID) { 7494 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ 7495 if (!has_priv_suser) { 7496 if (VATTR_IS_ACTIVE(vap, va_uid)) { 7497 owner = vap->va_uid; 7498 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) { 7499 owner = ova.va_uid; 7500 } else { 7501 KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available"); 7502 error = EINVAL; 7503 goto out; 7504 } 7505 if (owner != kauth_cred_getuid(cred)) { 7506 /* 7507 * We could allow this if WRITE_SECURITY is permitted, perhaps. 7508 */ 7509 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); 7510 error = EPERM; 7511 goto out; 7512 } 7513 } 7514 } 7515 } 7516 7517 /* 7518 * Validate/mask flags changes. This checks that only the flags in 7519 * the UF_SETTABLE mask are being set, and preserves the flags in 7520 * the SF_SETTABLE case. 7521 * 7522 * Since flags changes may be made in conjunction with other changes, 7523 * we will ask the auth code to ignore immutability in the case that 7524 * the SF_* flags are not set and we are only manipulating the file flags. 
7525 * 7526 */ 7527 if (VATTR_IS_ACTIVE(vap, va_flags)) { 7528 /* compute changing flags bits */ 7529 if (VATTR_IS_SUPPORTED(&ova, va_flags)) { 7530 fdelta = vap->va_flags ^ ova.va_flags; 7531 } else { 7532 fdelta = vap->va_flags; 7533 } 7534 7535 if (fdelta != 0) { 7536 KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY"); 7537 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7538 7539 /* check that changing bits are legal */ 7540 if (has_priv_suser) { 7541 /* 7542 * The immutability check will prevent us from clearing the SF_* 7543 * flags unless the system securelevel permits it, so just check 7544 * for legal flags here. 7545 */ 7546 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) { 7547 error = EPERM; 7548 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)"); 7549 goto out; 7550 } 7551 } else { 7552 if (fdelta & ~UF_SETTABLE) { 7553 error = EPERM; 7554 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)"); 7555 goto out; 7556 } 7557 } 7558 /* 7559 * If the caller has the ability to manipulate file flags, 7560 * security is not reduced by ignoring them for this operation. 7561 * 7562 * A more complete test here would consider the 'after' states of the flags 7563 * to determine whether it would permit the operation, but this becomes 7564 * very complex. 7565 * 7566 * Ignoring immutability is conditional on securelevel; this does not bypass 7567 * the SF_* flags if securelevel > 0. 7568 */ 7569 required_action |= KAUTH_VNODE_NOIMMUTABLE; 7570 } 7571 } 7572 7573 /* 7574 * Validate ownership information. 7575 */ 7576 chowner = 0; 7577 chgroup = 0; 7578 clear_suid = 0; 7579 clear_sgid = 0; 7580 7581 /* 7582 * uid changing 7583 * Note that if the filesystem didn't give us a UID, we expect that it doesn't 7584 * support them in general, and will ignore it if/when we try to set it. 7585 * We might want to clear the uid out of vap completely here. 
7586 */ 7587 if (VATTR_IS_ACTIVE(vap, va_uid)) { 7588 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) { 7589 if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) { 7590 KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party"); 7591 error = EPERM; 7592 goto out; 7593 } 7594 chowner = 1; 7595 } 7596 clear_suid = 1; 7597 } 7598 7599 /* 7600 * gid changing 7601 * Note that if the filesystem didn't give us a GID, we expect that it doesn't 7602 * support them in general, and will ignore it if/when we try to set it. 7603 * We might want to clear the gid out of vap completely here. 7604 */ 7605 if (VATTR_IS_ACTIVE(vap, va_gid)) { 7606 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) { 7607 if (!has_priv_suser) { 7608 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 7609 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid); 7610 goto out; 7611 } 7612 if (!ismember) { 7613 KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group", 7614 ova.va_gid, vap->va_gid); 7615 error = EPERM; 7616 goto out; 7617 } 7618 } 7619 chgroup = 1; 7620 } 7621 clear_sgid = 1; 7622 } 7623 7624 /* 7625 * Owner UUID being set or changed. 7626 */ 7627 if (VATTR_IS_ACTIVE(vap, va_uuuid)) { 7628 /* if the owner UUID is not actually changing ... */ 7629 if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) { 7630 if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) 7631 goto no_uuuid_change; 7632 7633 /* 7634 * If the current owner UUID is a null GUID, check 7635 * it against the UUID corresponding to the owner UID. 
7636 */ 7637 if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) && 7638 VATTR_IS_SUPPORTED(&ova, va_uid)) { 7639 guid_t uid_guid; 7640 7641 if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 && 7642 kauth_guid_equal(&vap->va_uuuid, &uid_guid)) 7643 goto no_uuuid_change; 7644 } 7645 } 7646 7647 /* 7648 * The owner UUID cannot be set by a non-superuser to anything other than 7649 * their own or a null GUID (to "unset" the owner UUID). 7650 * Note that file systems must be prepared to handle the 7651 * null UUID case in a manner appropriate for that file 7652 * system. 7653 */ 7654 if (!has_priv_suser) { 7655 if ((error = kauth_cred_getguid(cred, &changer)) != 0) { 7656 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error); 7657 /* XXX ENOENT here - no UUID - should perhaps become EPERM */ 7658 goto out; 7659 } 7660 if (!kauth_guid_equal(&vap->va_uuuid, &changer) && 7661 !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) { 7662 KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null"); 7663 error = EPERM; 7664 goto out; 7665 } 7666 } 7667 chowner = 1; 7668 clear_suid = 1; 7669 } 7670no_uuuid_change: 7671 /* 7672 * Group UUID being set or changed. 7673 */ 7674 if (VATTR_IS_ACTIVE(vap, va_guuid)) { 7675 /* if the group UUID is not actually changing ... */ 7676 if (VATTR_IS_SUPPORTED(&ova, va_guuid)) { 7677 if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) 7678 goto no_guuid_change; 7679 7680 /* 7681 * If the current group UUID is a null UUID, check 7682 * it against the UUID corresponding to the group GID. 
7683 */ 7684 if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) && 7685 VATTR_IS_SUPPORTED(&ova, va_gid)) { 7686 guid_t gid_guid; 7687 7688 if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 && 7689 kauth_guid_equal(&vap->va_guuid, &gid_guid)) 7690 goto no_guuid_change; 7691 } 7692 } 7693 7694 /* 7695 * The group UUID cannot be set by a non-superuser to anything other than 7696 * one of which they are a member or a null GUID (to "unset" 7697 * the group UUID). 7698 * Note that file systems must be prepared to handle the 7699 * null UUID case in a manner appropriate for that file 7700 * system. 7701 */ 7702 if (!has_priv_suser) { 7703 if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) 7704 ismember = 1; 7705 else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) { 7706 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error); 7707 goto out; 7708 } 7709 if (!ismember) { 7710 KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null"); 7711 error = EPERM; 7712 goto out; 7713 } 7714 } 7715 chgroup = 1; 7716 } 7717no_guuid_change: 7718 7719 /* 7720 * Compute authorisation for group/ownership changes. 
7721 */ 7722 if (chowner || chgroup || clear_suid || clear_sgid) { 7723 if (has_priv_suser) { 7724 KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check"); 7725 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; 7726 } else { 7727 if (chowner) { 7728 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP"); 7729 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP; 7730 } 7731 if (chgroup && !chowner) { 7732 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY"); 7733 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7734 } 7735 7736 /* clear set-uid and set-gid bits as required by Posix */ 7737 if (VATTR_IS_ACTIVE(vap, va_mode)) { 7738 newmode = vap->va_mode; 7739 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) { 7740 newmode = ova.va_mode; 7741 } else { 7742 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits"); 7743 newmode = 0; 7744 } 7745 if (newmode & (S_ISUID | S_ISGID)) { 7746 VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID)); 7747 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode); 7748 } 7749 } 7750 } 7751 7752 /* 7753 * Authorise changes in the ACL. 
7754 */ 7755 if (VATTR_IS_ACTIVE(vap, va_acl)) { 7756 7757 /* no existing ACL */ 7758 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) { 7759 7760 /* adding an ACL */ 7761 if (vap->va_acl != NULL) { 7762 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7763 KAUTH_DEBUG("CHMOD - adding ACL"); 7764 } 7765 7766 /* removing an existing ACL */ 7767 } else if (vap->va_acl == NULL) { 7768 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7769 KAUTH_DEBUG("CHMOD - removing ACL"); 7770 7771 /* updating an existing ACL */ 7772 } else { 7773 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) { 7774 /* entry count changed, must be different */ 7775 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7776 KAUTH_DEBUG("CHMOD - adding/removing ACL entries"); 7777 } else if (vap->va_acl->acl_entrycount > 0) { 7778 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ 7779 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], 7780 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { 7781 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7782 KAUTH_DEBUG("CHMOD - changing ACL entries"); 7783 } 7784 } 7785 } 7786 } 7787 7788 /* 7789 * Other attributes that require authorisation. 
7790 */ 7791 if (VATTR_IS_ACTIVE(vap, va_encoding)) 7792 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; 7793 7794out: 7795 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) 7796 kauth_acl_free(ova.va_acl); 7797 if (error == 0) 7798 *actionp = required_action; 7799 return(error); 7800} 7801 7802static int 7803setlocklocal_callback(struct vnode *vp, __unused void *cargs) 7804{ 7805 vnode_lock_spin(vp); 7806 vp->v_flag |= VLOCKLOCAL; 7807 vnode_unlock(vp); 7808 7809 return (VNODE_RETURNED); 7810} 7811 7812void 7813vfs_setlocklocal(mount_t mp) 7814{ 7815 mount_lock_spin(mp); 7816 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; 7817 mount_unlock(mp); 7818 7819 /* 7820 * The number of active vnodes is expected to be 7821 * very small when vfs_setlocklocal is invoked. 7822 */ 7823 vnode_iterate(mp, 0, setlocklocal_callback, NULL); 7824} 7825 7826void 7827vfs_setunmountpreflight(mount_t mp) 7828{ 7829 mount_lock_spin(mp); 7830 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT; 7831 mount_unlock(mp); 7832} 7833 7834void 7835vfs_setcompoundopen(mount_t mp) 7836{ 7837 mount_lock_spin(mp); 7838 mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN; 7839 mount_unlock(mp); 7840} 7841 7842void 7843vn_setunionwait(vnode_t vp) 7844{ 7845 vnode_lock_spin(vp); 7846 vp->v_flag |= VISUNION; 7847 vnode_unlock(vp); 7848} 7849 7850 7851void 7852vn_checkunionwait(vnode_t vp) 7853{ 7854 vnode_lock_spin(vp); 7855 while ((vp->v_flag & VISUNION) == VISUNION) 7856 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0); 7857 vnode_unlock(vp); 7858} 7859 7860void 7861vn_clearunionwait(vnode_t vp, int locked) 7862{ 7863 if (!locked) 7864 vnode_lock_spin(vp); 7865 if((vp->v_flag & VISUNION) == VISUNION) { 7866 vp->v_flag &= ~VISUNION; 7867 wakeup((caddr_t)&vp->v_flag); 7868 } 7869 if (!locked) 7870 vnode_unlock(vp); 7871} 7872 7873/* 7874 * XXX - get "don't trigger mounts" flag for thread; used by autofs. 
7875 */ 7876extern int thread_notrigger(void); 7877 7878int 7879thread_notrigger(void) 7880{ 7881 struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread()); 7882 return (uth->uu_notrigger); 7883} 7884 7885/* 7886 * Removes orphaned apple double files during a rmdir 7887 * Works by: 7888 * 1. vnode_suspend(). 7889 * 2. Call VNOP_READDIR() till the end of directory is reached. 7890 * 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY. 7891 * 4. Continue (2) and (3) till end of directory is reached. 7892 * 5. If all the entries in the directory were files with "._" name, delete all the files. 7893 * 6. vnode_resume() 7894 * 7. If deletion of all files succeeded, call VNOP_RMDIR() again. 7895 */ 7896 7897errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * restart_flag) 7898{ 7899 7900#define UIO_BUFF_SIZE 2048 7901 uio_t auio = NULL; 7902 int eofflag, siz = UIO_BUFF_SIZE, nentries = 0; 7903 int open_flag = 0, full_erase_flag = 0; 7904 char uio_buf[ UIO_SIZEOF(1) ]; 7905 char *rbuf = NULL, *cpos, *cend; 7906 struct nameidata nd_temp; 7907 struct dirent *dp; 7908 errno_t error; 7909 7910 error = vnode_suspend(vp); 7911 7912 /* 7913 * restart_flag is set so that the calling rmdir sleeps and resets 7914 */ 7915 if (error == EBUSY) 7916 *restart_flag = 1; 7917 if (error != 0) 7918 goto outsc; 7919 7920 /* 7921 * set up UIO 7922 */ 7923 MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); 7924 if (rbuf) 7925 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, 7926 &uio_buf[0], sizeof(uio_buf)); 7927 if (!rbuf || !auio) { 7928 error = ENOMEM; 7929 goto outsc; 7930 } 7931 7932 uio_setoffset(auio,0); 7933 7934 eofflag = 0; 7935 7936 if ((error = VNOP_OPEN(vp, FREAD, ctx))) 7937 goto outsc; 7938 else 7939 open_flag = 1; 7940 7941 /* 7942 * First pass checks if all files are appleDouble files. 
7943 */ 7944 7945 do { 7946 siz = UIO_BUFF_SIZE; 7947 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ); 7948 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE); 7949 7950 if((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) 7951 goto outsc; 7952 7953 if (uio_resid(auio) != 0) 7954 siz -= uio_resid(auio); 7955 7956 /* 7957 * Iterate through directory 7958 */ 7959 cpos = rbuf; 7960 cend = rbuf + siz; 7961 dp = (struct dirent*) cpos; 7962 7963 if (cpos == cend) 7964 eofflag = 1; 7965 7966 while ((cpos < cend)) { 7967 /* 7968 * Check for . and .. as well as directories 7969 */ 7970 if (dp->d_ino != 0 && 7971 !((dp->d_namlen == 1 && dp->d_name[0] == '.') || 7972 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) { 7973 /* 7974 * Check for irregular files and ._ files 7975 * If there is a ._._ file abort the op 7976 */ 7977 if ( dp->d_namlen < 2 || 7978 strncmp(dp->d_name,"._",2) || 7979 (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._",2))) { 7980 error = ENOTEMPTY; 7981 goto outsc; 7982 } 7983 } 7984 cpos += dp->d_reclen; 7985 dp = (struct dirent*)cpos; 7986 } 7987 7988 /* 7989 * workaround for HFS/NFS setting eofflag before end of file 7990 */ 7991 if (vp->v_tag == VT_HFS && nentries > 2) 7992 eofflag=0; 7993 7994 if (vp->v_tag == VT_NFS) { 7995 if (eofflag && !full_erase_flag) { 7996 full_erase_flag = 1; 7997 eofflag = 0; 7998 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ); 7999 } 8000 else if (!eofflag && full_erase_flag) 8001 full_erase_flag = 0; 8002 } 8003 8004 } while (!eofflag); 8005 /* 8006 * If we've made it here all the files in the dir are ._ files. 8007 * We can delete the files even though the node is suspended 8008 * because we are the owner of the file. 
8009 */ 8010 8011 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ); 8012 eofflag = 0; 8013 full_erase_flag = 0; 8014 8015 do { 8016 siz = UIO_BUFF_SIZE; 8017 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ); 8018 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE); 8019 8020 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx); 8021 8022 if (error != 0) 8023 goto outsc; 8024 8025 if (uio_resid(auio) != 0) 8026 siz -= uio_resid(auio); 8027 8028 /* 8029 * Iterate through directory 8030 */ 8031 cpos = rbuf; 8032 cend = rbuf + siz; 8033 dp = (struct dirent*) cpos; 8034 8035 if (cpos == cend) 8036 eofflag = 1; 8037 8038 while ((cpos < cend)) { 8039 /* 8040 * Check for . and .. as well as directories 8041 */ 8042 if (dp->d_ino != 0 && 8043 !((dp->d_namlen == 1 && dp->d_name[0] == '.') || 8044 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.')) 8045 ) { 8046 8047 NDINIT(&nd_temp, DELETE, OP_UNLINK, USEDVP, 8048 UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), 8049 ctx); 8050 nd_temp.ni_dvp = vp; 8051 error = unlink1(ctx, &nd_temp, VNODE_REMOVE_SKIP_NAMESPACE_EVENT); 8052 8053 if (error && error != ENOENT) { 8054 goto outsc; 8055 } 8056 8057 } 8058 cpos += dp->d_reclen; 8059 dp = (struct dirent*)cpos; 8060 } 8061 8062 /* 8063 * workaround for HFS/NFS setting eofflag before end of file 8064 */ 8065 if (vp->v_tag == VT_HFS && nentries > 2) 8066 eofflag=0; 8067 8068 if (vp->v_tag == VT_NFS) { 8069 if (eofflag && !full_erase_flag) { 8070 full_erase_flag = 1; 8071 eofflag = 0; 8072 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ); 8073 } 8074 else if (!eofflag && full_erase_flag) 8075 full_erase_flag = 0; 8076 } 8077 8078 } while (!eofflag); 8079 8080 8081 error = 0; 8082 8083outsc: 8084 if (open_flag) 8085 VNOP_CLOSE(vp, FREAD, ctx); 8086 8087 uio_free(auio); 8088 FREE(rbuf, M_TEMP); 8089 8090 vnode_resume(vp); 8091 8092 8093 return(error); 8094 8095} 8096 8097 8098void 8099lock_vnode_and_post(vnode_t vp, int kevent_num) 8100{ 8101 /* Only take the lock if 
there's something there! */ 8102 if (vp->v_knotes.slh_first != NULL) { 8103 vnode_lock(vp); 8104 KNOTE(&vp->v_knotes, kevent_num); 8105 vnode_unlock(vp); 8106 } 8107} 8108 8109#ifdef JOE_DEBUG 8110static void record_vp(vnode_t vp, int count) { 8111 struct uthread *ut; 8112 8113#if CONFIG_TRIGGERS 8114 if (vp->v_resolve) 8115 return; 8116#endif 8117 if ((vp->v_flag & VSYSTEM)) 8118 return; 8119 8120 ut = get_bsdthread_info(current_thread()); 8121 ut->uu_iocount += count; 8122 8123 if (count == 1) { 8124 if (ut->uu_vpindex < 32) { 8125 OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10); 8126 8127 ut->uu_vps[ut->uu_vpindex] = vp; 8128 ut->uu_vpindex++; 8129 } 8130 } 8131} 8132#endif 8133 8134 8135#if CONFIG_TRIGGERS 8136 8137#define TRIG_DEBUG 0 8138 8139#if TRIG_DEBUG 8140#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0) 8141#else 8142#define TRIG_LOG(...) 8143#endif 8144 8145/* 8146 * Resolver result functions 8147 */ 8148 8149resolver_result_t 8150vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux) 8151{ 8152 /* 8153 * |<--- 32 --->|<--- 28 --->|<- 4 ->| 8154 * sequence auxiliary status 8155 */ 8156 return (((uint64_t)seq) << 32) | 8157 (((uint64_t)(aux & 0x0fffffff)) << 4) | 8158 (uint64_t)(stat & 0x0000000F); 8159} 8160 8161enum resolver_status 8162vfs_resolver_status(resolver_result_t result) 8163{ 8164 /* lower 4 bits is status */ 8165 return (result & 0x0000000F); 8166} 8167 8168uint32_t 8169vfs_resolver_sequence(resolver_result_t result) 8170{ 8171 /* upper 32 bits is sequence */ 8172 return (uint32_t)(result >> 32); 8173} 8174 8175int 8176vfs_resolver_auxiliary(resolver_result_t result) 8177{ 8178 /* 28 bits of auxiliary */ 8179 return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4); 8180} 8181 8182/* 8183 * SPI 8184 * Call in for resolvers to update vnode trigger state 8185 */ 8186int 8187vnode_trigger_update(vnode_t vp, resolver_result_t result) 8188{ 8189 vnode_resolve_t rp; 8190 uint32_t seq; 
8191 enum resolver_status stat; 8192 8193 if (vp->v_resolve == NULL) { 8194 return (EINVAL); 8195 } 8196 8197 stat = vfs_resolver_status(result); 8198 seq = vfs_resolver_sequence(result); 8199 8200 if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) { 8201 return (EINVAL); 8202 } 8203 8204 rp = vp->v_resolve; 8205 lck_mtx_lock(&rp->vr_lock); 8206 8207 if (seq > rp->vr_lastseq) { 8208 if (stat == RESOLVER_RESOLVED) 8209 rp->vr_flags |= VNT_RESOLVED; 8210 else 8211 rp->vr_flags &= ~VNT_RESOLVED; 8212 8213 rp->vr_lastseq = seq; 8214 } 8215 8216 lck_mtx_unlock(&rp->vr_lock); 8217 8218 return (0); 8219} 8220 8221static int 8222vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref) 8223{ 8224 int error; 8225 8226 vnode_lock_spin(vp); 8227 if (vp->v_resolve != NULL) { 8228 vnode_unlock(vp); 8229 return EINVAL; 8230 } else { 8231 vp->v_resolve = rp; 8232 } 8233 vnode_unlock(vp); 8234 8235 if (ref) { 8236 error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE); 8237 if (error != 0) { 8238 panic("VNODE_REF_FORCE didn't help..."); 8239 } 8240 } 8241 8242 return 0; 8243} 8244 8245/* 8246 * VFS internal interfaces for vnode triggers 8247 * 8248 * vnode must already have an io count on entry 8249 * v_resolve is stable when io count is non-zero 8250 */ 8251static int 8252vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external) 8253{ 8254 vnode_resolve_t rp; 8255 int result; 8256 char byte; 8257 8258#if 1 8259 /* minimum pointer test (debugging) */ 8260 if (tinfo->vnt_data) 8261 byte = *((char *)tinfo->vnt_data); 8262#endif 8263 MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK); 8264 if (rp == NULL) 8265 return (ENOMEM); 8266 8267 lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr); 8268 8269 rp->vr_resolve_func = tinfo->vnt_resolve_func; 8270 rp->vr_unresolve_func = tinfo->vnt_unresolve_func; 8271 rp->vr_rearm_func = tinfo->vnt_rearm_func; 8272 rp->vr_reclaim_func = 
tinfo->vnt_reclaim_func; 8273 rp->vr_data = tinfo->vnt_data; 8274 rp->vr_lastseq = 0; 8275 rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK; 8276 if (external) { 8277 rp->vr_flags |= VNT_EXTERNAL; 8278 } 8279 8280 result = vnode_resolver_attach(vp, rp, external); 8281 if (result != 0) { 8282 goto out; 8283 } 8284 8285 if (mp) { 8286 OSAddAtomic(1, &mp->mnt_numtriggers); 8287 } 8288 8289 return (result); 8290 8291out: 8292 FREE(rp, M_TEMP); 8293 return result; 8294} 8295 8296static void 8297vnode_resolver_release(vnode_resolve_t rp) 8298{ 8299 /* 8300 * Give them a chance to free any private data 8301 */ 8302 if (rp->vr_data && rp->vr_reclaim_func) { 8303 rp->vr_reclaim_func(NULLVP, rp->vr_data); 8304 } 8305 8306 lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp); 8307 FREE(rp, M_TEMP); 8308 8309} 8310 8311/* Called after the vnode has been drained */ 8312static void 8313vnode_resolver_detach(vnode_t vp) 8314{ 8315 vnode_resolve_t rp; 8316 mount_t mp; 8317 8318 mp = vnode_mount(vp); 8319 8320 vnode_lock(vp); 8321 rp = vp->v_resolve; 8322 vp->v_resolve = NULL; 8323 vnode_unlock(vp); 8324 8325 if ((rp->vr_flags & VNT_EXTERNAL) != 0) { 8326 vnode_rele_ext(vp, O_EVTONLY, 1); 8327 } 8328 8329 vnode_resolver_release(rp); 8330 8331 /* Keep count of active trigger vnodes per mount */ 8332 OSAddAtomic(-1, &mp->mnt_numtriggers); 8333} 8334 8335/* 8336 * Pathname operations that don't trigger a mount for trigger vnodes 8337 */ 8338static const u_int64_t ignorable_pathops_mask = 8339 1LL << OP_MOUNT | 8340 1LL << OP_UNMOUNT | 8341 1LL << OP_STATFS | 8342 1LL << OP_ACCESS | 8343 1LL << OP_GETATTR | 8344 1LL << OP_LISTXATTR; 8345 8346int 8347vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp) 8348{ 8349 if (cnp->cn_flags & ISLASTCN) 8350 return ((1LL << op) & ignorable_pathops_mask) == 0; 8351 else 8352 return (1); 8353} 8354 8355__private_extern__ 8356void 8357vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx) 8358{ 8359 vnode_resolve_t rp; 8360 
resolver_result_t result; 8361 enum resolver_status status; 8362 uint32_t seq; 8363 8364 if ((vp->v_resolve == NULL) || 8365 (vp->v_resolve->vr_rearm_func == NULL) || 8366 (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) { 8367 return; 8368 } 8369 8370 rp = vp->v_resolve; 8371 lck_mtx_lock(&rp->vr_lock); 8372 8373 /* 8374 * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes. 8375 */ 8376 if (rp->vr_flags & VNT_VFS_UNMOUNTED) { 8377 lck_mtx_unlock(&rp->vr_lock); 8378 return; 8379 } 8380 8381 /* Check if this vnode is already armed */ 8382 if ((rp->vr_flags & VNT_RESOLVED) == 0) { 8383 lck_mtx_unlock(&rp->vr_lock); 8384 return; 8385 } 8386 8387 lck_mtx_unlock(&rp->vr_lock); 8388 8389 result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx); 8390 status = vfs_resolver_status(result); 8391 seq = vfs_resolver_sequence(result); 8392 8393 lck_mtx_lock(&rp->vr_lock); 8394 if (seq > rp->vr_lastseq) { 8395 if (status == RESOLVER_UNRESOLVED) 8396 rp->vr_flags &= ~VNT_RESOLVED; 8397 rp->vr_lastseq = seq; 8398 } 8399 lck_mtx_unlock(&rp->vr_lock); 8400} 8401 8402__private_extern__ 8403int 8404vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) 8405{ 8406 vnode_resolve_t rp; 8407 enum path_operation op; 8408 resolver_result_t result; 8409 enum resolver_status status; 8410 uint32_t seq; 8411 8412 /* Only trigger on topmost vnodes */ 8413 if ((vp->v_resolve == NULL) || 8414 (vp->v_resolve->vr_resolve_func == NULL) || 8415 (vp->v_mountedhere != NULL)) { 8416 return (0); 8417 } 8418 8419 rp = vp->v_resolve; 8420 lck_mtx_lock(&rp->vr_lock); 8421 8422 /* Check if this vnode is already resolved */ 8423 if (rp->vr_flags & VNT_RESOLVED) { 8424 lck_mtx_unlock(&rp->vr_lock); 8425 return (0); 8426 } 8427 8428 lck_mtx_unlock(&rp->vr_lock); 8429 8430 /* 8431 * XXX 8432 * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) 8433 * is there anyway to know this??? 
8434 * there can also be other legitimate lookups in parallel 8435 * 8436 * XXX - should we call this on a separate thread with a timeout? 8437 * 8438 * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leafs should 8439 * get the richer set and non-leafs should get generic OP_LOOKUP? TBD 8440 */ 8441 op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP; 8442 8443 result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx); 8444 status = vfs_resolver_status(result); 8445 seq = vfs_resolver_sequence(result); 8446 8447 lck_mtx_lock(&rp->vr_lock); 8448 if (seq > rp->vr_lastseq) { 8449 if (status == RESOLVER_RESOLVED) 8450 rp->vr_flags |= VNT_RESOLVED; 8451 rp->vr_lastseq = seq; 8452 } 8453 lck_mtx_unlock(&rp->vr_lock); 8454 8455 /* On resolver errors, propagate the error back up */ 8456 return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); 8457} 8458 8459static int 8460vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx) 8461{ 8462 vnode_resolve_t rp; 8463 resolver_result_t result; 8464 enum resolver_status status; 8465 uint32_t seq; 8466 8467 if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) { 8468 return (0); 8469 } 8470 8471 rp = vp->v_resolve; 8472 lck_mtx_lock(&rp->vr_lock); 8473 8474 /* Check if this vnode is already resolved */ 8475 if ((rp->vr_flags & VNT_RESOLVED) == 0) { 8476 printf("vnode_trigger_unresolve: not currently resolved\n"); 8477 lck_mtx_unlock(&rp->vr_lock); 8478 return (0); 8479 } 8480 8481 rp->vr_flags |= VNT_VFS_UNMOUNTED; 8482 8483 lck_mtx_unlock(&rp->vr_lock); 8484 8485 /* 8486 * XXX 8487 * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) 8488 * there can also be other legitimate lookups in parallel 8489 * 8490 * XXX - should we call this on a separate thread with a timeout? 
8491 */ 8492 8493 result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx); 8494 status = vfs_resolver_status(result); 8495 seq = vfs_resolver_sequence(result); 8496 8497 lck_mtx_lock(&rp->vr_lock); 8498 if (seq > rp->vr_lastseq) { 8499 if (status == RESOLVER_UNRESOLVED) 8500 rp->vr_flags &= ~VNT_RESOLVED; 8501 rp->vr_lastseq = seq; 8502 } 8503 rp->vr_flags &= ~VNT_VFS_UNMOUNTED; 8504 lck_mtx_unlock(&rp->vr_lock); 8505 8506 /* On resolver errors, propagate the error back up */ 8507 return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); 8508} 8509 8510static int 8511triggerisdescendant(mount_t mp, mount_t rmp) 8512{ 8513 int match = FALSE; 8514 8515 /* 8516 * walk up vnode covered chain looking for a match 8517 */ 8518 name_cache_lock_shared(); 8519 8520 while (1) { 8521 vnode_t vp; 8522 8523 /* did we encounter "/" ? */ 8524 if (mp->mnt_flag & MNT_ROOTFS) 8525 break; 8526 8527 vp = mp->mnt_vnodecovered; 8528 if (vp == NULLVP) 8529 break; 8530 8531 mp = vp->v_mount; 8532 if (mp == rmp) { 8533 match = TRUE; 8534 break; 8535 } 8536 } 8537 8538 name_cache_unlock(); 8539 8540 return (match); 8541} 8542 8543struct trigger_unmount_info { 8544 vfs_context_t ctx; 8545 mount_t top_mp; 8546 vnode_t trigger_vp; 8547 mount_t trigger_mp; 8548 uint32_t trigger_vid; 8549 int flags; 8550}; 8551 8552static int 8553trigger_unmount_callback(mount_t mp, void * arg) 8554{ 8555 struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg; 8556 boolean_t mountedtrigger = FALSE; 8557 8558 /* 8559 * When we encounter the top level mount we're done 8560 */ 8561 if (mp == infop->top_mp) 8562 return (VFS_RETURNED_DONE); 8563 8564 if ((mp->mnt_vnodecovered == NULL) || 8565 (vnode_getwithref(mp->mnt_vnodecovered) != 0)) { 8566 return (VFS_RETURNED); 8567 } 8568 8569 if ((mp->mnt_vnodecovered->v_mountedhere == mp) && 8570 (mp->mnt_vnodecovered->v_resolve != NULL) && 8571 (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) { 8572 mountedtrigger = TRUE; 8573 
} 8574 vnode_put(mp->mnt_vnodecovered); 8575 8576 /* 8577 * When we encounter a mounted trigger, check if its under the top level mount 8578 */ 8579 if ( !mountedtrigger || !triggerisdescendant(mp, infop->top_mp) ) 8580 return (VFS_RETURNED); 8581 8582 /* 8583 * Process any pending nested mount (now that its not referenced) 8584 */ 8585 if ((infop->trigger_vp != NULLVP) && 8586 (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) { 8587 vnode_t vp = infop->trigger_vp; 8588 int error; 8589 8590 infop->trigger_vp = NULLVP; 8591 8592 if (mp == vp->v_mountedhere) { 8593 vnode_put(vp); 8594 printf("trigger_unmount_callback: unexpected match '%s'\n", 8595 mp->mnt_vfsstat.f_mntonname); 8596 return (VFS_RETURNED); 8597 } 8598 if (infop->trigger_mp != vp->v_mountedhere) { 8599 vnode_put(vp); 8600 printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n", 8601 infop->trigger_mp, vp->v_mountedhere); 8602 goto savenext; 8603 } 8604 8605 error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx); 8606 vnode_put(vp); 8607 if (error) { 8608 printf("unresolving: '%s', err %d\n", 8609 vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname : 8610 "???", error); 8611 return (VFS_RETURNED_DONE); /* stop iteration on errors */ 8612 } 8613 } 8614savenext: 8615 /* 8616 * We can't call resolver here since we hold a mount iter 8617 * ref on mp so save its covered vp for later processing 8618 */ 8619 infop->trigger_vp = mp->mnt_vnodecovered; 8620 if ((infop->trigger_vp != NULLVP) && 8621 (vnode_getwithref(infop->trigger_vp) == 0)) { 8622 if (infop->trigger_vp->v_mountedhere == mp) { 8623 infop->trigger_vid = infop->trigger_vp->v_id; 8624 infop->trigger_mp = mp; 8625 } 8626 vnode_put(infop->trigger_vp); 8627 } 8628 8629 return (VFS_RETURNED); 8630} 8631 8632/* 8633 * Attempt to unmount any trigger mounts nested underneath a mount. 8634 * This is a best effort attempt and no retries are performed here. 
8635 * 8636 * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull) 8637 */ 8638__private_extern__ 8639void 8640vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx) 8641{ 8642 struct trigger_unmount_info info; 8643 8644 /* Must have trigger vnodes */ 8645 if (mp->mnt_numtriggers == 0) { 8646 return; 8647 } 8648 /* Avoid recursive requests (by checking covered vnode) */ 8649 if ((mp->mnt_vnodecovered != NULL) && 8650 (vnode_getwithref(mp->mnt_vnodecovered) == 0)) { 8651 boolean_t recursive = FALSE; 8652 8653 if ((mp->mnt_vnodecovered->v_mountedhere == mp) && 8654 (mp->mnt_vnodecovered->v_resolve != NULL) && 8655 (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) { 8656 recursive = TRUE; 8657 } 8658 vnode_put(mp->mnt_vnodecovered); 8659 if (recursive) 8660 return; 8661 } 8662 8663 /* 8664 * Attempt to unmount any nested trigger mounts (best effort) 8665 */ 8666 info.ctx = ctx; 8667 info.top_mp = mp; 8668 info.trigger_vp = NULLVP; 8669 info.trigger_vid = 0; 8670 info.trigger_mp = NULL; 8671 info.flags = flags; 8672 8673 (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info); 8674 8675 /* 8676 * Process remaining nested mount (now that its not referenced) 8677 */ 8678 if ((info.trigger_vp != NULLVP) && 8679 (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) { 8680 vnode_t vp = info.trigger_vp; 8681 8682 if (info.trigger_mp == vp->v_mountedhere) { 8683 (void) vnode_trigger_unresolve(vp, flags, ctx); 8684 } 8685 vnode_put(vp); 8686 } 8687} 8688 8689int 8690vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx) 8691{ 8692 struct nameidata nd; 8693 int res; 8694 vnode_t rvp, vp; 8695 struct vnode_trigger_param vtp; 8696 8697 /* 8698 * Must be called for trigger callback, wherein rwlock is held 8699 */ 8700 lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD); 8701 8702 TRIG_LOG("Adding trigger at %s\n", relpath); 8703 TRIG_LOG("Trying VFS_ROOT\n"); 8704 8705 
/* 8706 * We do a lookup starting at the root of the mountpoint, unwilling 8707 * to cross into other mountpoints. 8708 */ 8709 res = VFS_ROOT(mp, &rvp, ctx); 8710 if (res != 0) { 8711 goto out; 8712 } 8713 8714 TRIG_LOG("Trying namei\n"); 8715 8716 NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE, 8717 CAST_USER_ADDR_T(relpath), ctx); 8718 nd.ni_dvp = rvp; 8719 res = namei(&nd); 8720 if (res != 0) { 8721 vnode_put(rvp); 8722 goto out; 8723 } 8724 8725 vp = nd.ni_vp; 8726 nameidone(&nd); 8727 vnode_put(rvp); 8728 8729 TRIG_LOG("Trying vnode_resolver_create()\n"); 8730 8731 /* 8732 * Set up blob. vnode_create() takes a larger structure 8733 * with creation info, and we needed something different 8734 * for this case. One needs to win, or we need to munge both; 8735 * vnode_create() wins. 8736 */ 8737 bzero(&vtp, sizeof(vtp)); 8738 vtp.vnt_resolve_func = vtip->vti_resolve_func; 8739 vtp.vnt_unresolve_func = vtip->vti_unresolve_func; 8740 vtp.vnt_rearm_func = vtip->vti_rearm_func; 8741 vtp.vnt_reclaim_func = vtip->vti_reclaim_func; 8742 vtp.vnt_reclaim_func = vtip->vti_reclaim_func; 8743 vtp.vnt_data = vtip->vti_data; 8744 vtp.vnt_flags = vtip->vti_flags; 8745 8746 res = vnode_resolver_create(mp, vp, &vtp, TRUE); 8747 vnode_put(vp); 8748out: 8749 TRIG_LOG("Returning %d\n", res); 8750 return res; 8751} 8752 8753#endif /* CONFIG_TRIGGERS */ 8754