/*
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 * $FreeBSD: head/sys/fs/unionfs/union_subr.c 71998 2001-02-04 12:37:48Z phk $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>	/* for vnode_pager_setsize */
#include <vm/vm_zone.h>
#include <vm/vm_object.h>	/* for vm cache coherency */
#include <miscfs/union/union.h>

#include <sys/proc.h>

extern int	union_init __P((void));

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1))
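
/*
 * Worked example of UNION_HASH (hypothetical 32-bit addresses, for
 * illustration only): summing the two vnode pointers and shifting right
 * by 8 discards the low-order bits, which carry little entropy because
 * vnodes are fixed-size heap allocations; the mask then picks one of
 * the NHASH chains.  With u = 0xc1234500 and l = 0xc1239900, the sum
 * truncates to 0x8246de00, the shift yields 0x008246de, and masking
 * with (NHASH-1) == 0x1f selects chain 30.
 */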

static LIST_HEAD(unhead, union_node) unhead[NHASH];
static int unvplock[NHASH];

static void	union_dircache_r __P((struct vnode *vp, struct vnode ***vppp,
				      int *cntp));
static int	union_list_lock __P((int ix));
static void	union_list_unlock __P((int ix));
static int	union_relookup __P((struct union_mount *um, struct vnode *dvp,
				    struct vnode **vpp,
				    struct componentname *cnp,
				    struct componentname *cn, char *path,
				    int pathlen));
static void	union_updatevp __P((struct union_node *un,
				    struct vnode *uppervp,
				    struct vnode *lowervp));
static void	union_newlower __P((struct union_node *, struct vnode *));
static void	union_newupper __P((struct union_node *, struct vnode *));
static int	union_copyfile __P((struct vnode *, struct vnode *,
				    struct ucred *, struct proc *));
static int	union_vn_create __P((struct vnode **, struct union_node *,
				     struct proc *));
static int	union_vn_close __P((struct vnode *, int, struct ucred *,
				    struct proc *));

int
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	bzero((caddr_t)unvplock, sizeof(unvplock));
	return (0);
}

static int
union_list_lock(ix)
	int ix;
{
	if (unvplock[ix] & UNVP_LOCKED) {
		unvplock[ix] |= UNVP_WANT;
		(void) tsleep((caddr_t) &unvplock[ix], PINOD, "unllck", 0);
		return (1);
	}
	unvplock[ix] |= UNVP_LOCKED;
	return (0);
}

static void
union_list_unlock(ix)
	int ix;
{
	unvplock[ix] &= ~UNVP_LOCKED;

	if (unvplock[ix] & UNVP_WANT) {
		unvplock[ix] &= ~UNVP_WANT;
		wakeup((caddr_t) &unvplock[ix]);
	}
}
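
/*
 * Usage sketch for the chain locks above (illustration only, not
 * compiled).  union_list_lock() returns non-zero when it had to sleep,
 * in which case the chain may have changed under us and the acquisition
 * must be retried from scratch, which is why every caller in this file
 * spins as follows:
 */
#if 0
	int hash = UNION_HASH(uppervp, lowervp);

	while (union_list_lock(hash))	/* slept: chain state may be stale */
		continue;		/* so try to take the lock again */
	/* ... walk or modify unhead[hash] ... */
	union_list_unlock(hash);
#endif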

/*
 * union_updatevp:
 *
 *	The uppervp, if not NULL, must be referenced and not locked by us
 *	The lowervp, if not NULL, must be referenced.
 *
 *	If uppervp and lowervp match pointers already installed, nothing
 *	happens.  The passed vp's (when matching) are not adjusted.  This
 *	routine may only be called by union_newupper() and union_newlower().
 */

static void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash) {
		while (union_list_lock(lhash))
			continue;
	}

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);
		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

/*
 * Set a new lowervp.  The passed lowervp must be referenced and will be
 * stored in the vp in a referenced state.
 */

static void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{
	union_updatevp(un, un->un_uppervp, lowervp);
}

/*
 * Set a new uppervp.  The passed uppervp must be locked and will be
 * stored in the vp in a locked state.  The caller should not unlock
 * uppervp.
 */

static void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{
	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then call back to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
		UDEBUG(("union: %s size now %ld\n",
			(uppersz != VNOVAL ? "upper" : "lower"), (long)sz));
		vnode_pager_setsize(vp, sz);
	}
}
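
/*
 * Worked example of the ordering discipline in union_updatevp(), with
 * hypothetical hash values: if the old vnode pair hashes to chain 5 and
 * the new pair to chain 2, the code locks chain 2 before chain 5.  Any
 * two processes rehashing nodes between the same two chains therefore
 * acquire the chain locks in the same order and cannot deadlock against
 * each other.
 */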

/*
 * union_allocvp:	allocate a union_node and associate it with a
 *			parent union_node and one or two vnodes.
 *
 *	vpp	Holds the returned vnode locked and referenced if no
 *		error occurs.
 *
 *	mp	Holds the mount point.  mp may or may not be busied.
 *		allocvp() makes no changes to mp.
 *
 *	dvp	Holds the parent union_node to the one we wish to create.
 *		XXX may only be used to traverse an uncopied lowervp-based
 *		tree? XXX
 *
 *		dvp may or may not be locked.  allocvp() makes no changes
 *		to dvp.
 *
 *	upperdvp Holds the parent vnode to uppervp, generally used along
 *		with path component information to create a shadow of
 *		lowervp when uppervp does not exist.
 *
 *		upperdvp is referenced but unlocked on entry, and will be
 *		dereferenced on return.
 *
 *	uppervp	Holds the new uppervp vnode to be stored in the
 *		union_node we are allocating.  uppervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	lowervp	Holds the new lowervp vnode to be stored in the
 *		union_node we are allocating.  lowervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	cnp	Holds path component information to be coupled with
 *		lowervp and upperdvp to allow unionfs to create an uppervp
 *		later on.  Only used if lowervp is valid.  The contents
 *		of cnp are only valid for the duration of the call.
 *
 *	docache	Determines whether this node should be entered in the
 *		cache or whether it should be destroyed as soon as possible.
 *
 * All union_nodes are maintained on a hash of lists.  New nodes are
 * only allocated when they cannot be found on these lists.  Entries are
 * removed when the vfs reclaim entry is called.
 *
 * A lock is kept for each hash chain.  This is needed because the
 * getnewvnode() function can block waiting for a vnode to become free,
 * in which case there may be more than one process trying to get the
 * same vnode.  This lock is only taken if we are going to call
 * getnewvnode(), since the kernel itself is single-threaded.
 *
 * If an entry is found on a list, then call vget() to take a
 * reference.  This is done because there may be zero references to it
 * and so it needs to be removed from the vnode free list.
 */

int
union_allocvp(vpp, mp, dvp, upperdvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *dvp;		/* parent union vnode */
	struct vnode *upperdvp;		/* parent vnode of uppervp */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = 0;
	struct vnode *xlowervp = NULLVP;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	struct proc *p = (cnp) ? cnp->cn_proc : curproc;
	int hash = 0;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		xlowervp = lowervp;
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VROOT;
	}
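
	/*
	 * Note on the probe order below: a cached node may be found
	 * under any of the three pair combinations (uppervp, lowervp),
	 * (uppervp, NULLVP) or (NULLVP, lowervp), depending on which
	 * layers existed when it was entered, so all three chains have
	 * to be tried before giving up.
	 */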
loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;

		LIST_FOREACH(un, &unhead[hash], un_cache) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0,
				    cnp ? cnp->cn_proc : NULL)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.  Everything is unlocked
		 * except for dvp, so check that case.  If they match, our
		 * new un is already locked.  Otherwise we have to lock our
		 * new un.
		 *
		 * A potential deadlock situation occurs when we are holding
		 * one lock while trying to get another.  We must follow
		 * strict ordering rules to avoid it.  We try to locate dvp
		 * by scanning up from un_vnode, since the most likely
		 * scenario is un being under dvp.
		 */

		if (dvp && un->un_vnode != dvp) {
			struct vnode *scan = un->un_vnode;

			do {
				scan = VTOUNION(scan)->un_pvp;
			} while (scan && scan->v_tag == VT_UNION && scan != dvp);
			if (scan != dvp) {
				/*
				 * our new un is above dvp (we never saw dvp
				 * while moving up the tree).
				 */
				VREF(dvp);
				VOP_UNLOCK(dvp, 0, p);
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, p);
				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p);
				vrele(dvp);
			} else {
				/*
				 * our new un is under dvp
				 */
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, p);
			}
		} else if (dvp == NULLVP) {
			/*
			 * dvp is NULL, we need to lock un.
			 */
			error = vn_lock(un->un_vnode, LK_EXCLUSIVE, p);
		} else {
			/*
			 * dvp == un->un_vnode, we are already locked.
			 */
			error = 0;
		}

		if (error)
			goto loop;

		/*
		 * At this point, the union_node is locked and referenced.
		 *
		 * uppervp is locked and referenced or NULL, lowervp is
		 * referenced or NULL.
		 */
		UDEBUG(("Modify existing un %p vn %p upper %p(refs %d) -> %p(refs %d)\n",
			un, un->un_vnode, un->un_uppervp,
			(un->un_uppervp ? un->un_uppervp->v_usecount : -99),
			uppervp,
			(uppervp ? uppervp->v_usecount : -99)
		));

		if (uppervp != un->un_uppervp) {
			KASSERT(uppervp == NULL || uppervp->v_usecount > 0,
			    ("union_allocvp: too few refs %d (at least 1 "
			    "required) on uppervp", uppervp->v_usecount));
			union_newupper(un, uppervp);
		} else if (uppervp) {
			KASSERT(uppervp->v_usecount > 1,
			    ("union_allocvp: too few refs %d (at least 2 "
			    "required) on uppervp", uppervp->v_usecount));
			vrele(uppervp);
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create()
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				bcopy(cnp->cn_nameptr, un->un_path,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
			}
		} else if (lowervp) {
			vrele(lowervp);
		}

		/*
		 * and upperdvp
		 */
		if (upperdvp != un->un_dirvp) {
			if (un->un_dirvp)
				vrele(un->un_dirvp);
			un->un_dirvp = upperdvp;
		} else if (upperdvp) {
			vrele(upperdvp);
		}

		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * otherwise lock the vp list while we call getnewvnode()
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	/*
	 * Create a new node rather than replace the old node.
	 */

	error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp);
	if (error) {
		/*
		 * If an error occurs, clear out vnodes.
		 */
		if (lowervp)
			vrele(lowervp);
		if (uppervp)
			vrele(uppervp);
		if (upperdvp)
			vrele(upperdvp);
		*vpp = NULL;
		goto out;
	}

	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	(*vpp)->v_flag |= vflag;
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;

	un = VTOUNION(*vpp);
	bzero(un, sizeof(*un));

	lockinit(&un->un_lock, PVFS, "unlock", 0, 0);
	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, p);

	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_dirvp = upperdvp;
	un->un_pvp = dvp;		/* only parent dir in new allocation */
	if (dvp != NULLVP)
		VREF(dvp);
	un->un_dircache = 0;
	un->un_openl = 0;

	if (cnp && (lowervp != NULLVP)) {
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
	} else {
		un->un_path = 0;
		un->un_dirvp = NULL;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

out:
	if (xlowervp)
		vrele(xlowervp);

	if (docache)
		union_list_unlock(hash);

	return (error);
}

int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP) {
		vrele(un->un_pvp);
		un->un_pvp = NULL;
	}
	if (un->un_uppervp != NULLVP) {
		vrele(un->un_uppervp);
		un->un_uppervp = NULL;
	}
	if (un->un_lowervp != NULLVP) {
		vrele(un->un_lowervp);
		un->un_lowervp = NULL;
	}
	if (un->un_dirvp != NULLVP) {
		vrele(un->un_dirvp);
		un->un_dirvp = NULL;
	}
	if (un->un_path) {
		free(un->un_path, M_TEMP);
		un->un_path = NULL;
	}
	lockdestroy(&un->un_lock);

	FREE(vp->v_data, M_TEMP);
	vp->v_data = 0;

	return (0);
}
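
/*
 * Caller's-eye sketch of union_allocvp() (illustration only, not
 * compiled; it mirrors the call made by union_dircache() later in this
 * file).  The layer vnodes are passed in referenced and those
 * references are consumed: on success the caller owns only the
 * returned, locked and referenced union vnode.
 */
#if 0
	struct vnode *nvp;
	int error;

	VREF(uppervp);		/* this reference is donated to the node */
	error = union_allocvp(&nvp, mp, NULLVP, NULLVP, NULL,
	    uppervp, NULLVP, 0);
	if (error == 0) {
		/* nvp is locked and referenced; use it, then: */
		vput(nvp);
	}
#endif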

/*
 * copyfile.  Copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  Both (fvp)
 * and (tvp) are locked on entry and exit.
 *
 * fvp and tvp are both exclusively locked on call, but their
 * refcounts have not been bumped at all.
 */
static int
union_copyfile(fvp, tvp, cred, p)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct proc *p;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * allocate a buffer of size MAXBSIZE.
	 * loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * give up at the first sign of trouble.
	 */

	bzero(&uio, sizeof(uio));

	uio.uio_procp = p;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_LEASE(fvp, p, cred, LEASE_READ);
	VOP_LEASE(tvp, p, cred, LEASE_WRITE);

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;
		int count;
		int bufoffset;

		/*
		 * Setup for big read
		 */
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(fvp, &uio, 0, cred)) != 0)
			break;

		/*
		 * Get bytes read, handle read eof case and setup for
		 * write loop
		 */
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;
		bufoffset = 0;

		/*
		 * Write until an error occurs or our buffer has been
		 * exhausted, then update the offset for the next read.
		 */
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if ((error = VOP_WRITE(tvp, &uio, 0, cred)) != 0)
				break;
			bufoffset += (count - bufoffset) - uio.uio_resid;
		}
		uio.uio_offset = offset + bufoffset;
	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}
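
/*
 * Worked example of the write-loop accounting above (hypothetical
 * numbers): if a 65536 byte read is followed by a partial write that
 * leaves uio_resid at 24576, then bufoffset advances by
 * (65536 - 0) - 24576 = 40960, and the next pass writes the remaining
 * 24576 bytes starting at buf + 40960.  The outer loop then resumes
 * reading at offset + bufoffset.
 */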

/*
 * un's vnode is assumed to be locked on entry and remains locked on exit.
 */

int
union_copyup(un, docopy, cred, p)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct proc *p;
{
	int error;
	struct mount *mp;
	struct vnode *lvp, *uvp;

	/*
	 * If the user does not have read permission, the vnode should not
	 * be copied to upper layer.
	 */
	vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_ACCESS(un->un_lowervp, VREAD, cred, p);
	VOP_UNLOCK(un->un_lowervp, 0, p);
	if (error)
		return (error);

	if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_vn_create(&uvp, un, p)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	lvp = un->un_lowervp;

	KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount));
	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_OPEN(lvp, FREAD, cred, p);
		if (error == 0 && vn_canvmio(lvp) == TRUE)
			error = vfs_object_create(lvp, p, cred);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, p);
			VOP_UNLOCK(lvp, 0, p);
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
		}
		if (error == 0)
			UDEBUG(("union: copied up %s\n", un->un_path));

	}
	VOP_UNLOCK(uvp, 0, p);
	vn_finished_write(mp);
	union_newupper(un, uvp);
	KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount));
	union_vn_close(uvp, FWRITE, cred, p);
	KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount));
	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
			(void) VOP_OPEN(uvp, FREAD, cred, p);
		}
		if (un->un_openl) {
			if (vn_canvmio(uvp) == TRUE)
				error = vfs_object_create(uvp, p, cred);
		}
		un->un_openl = 0;
	}

	return (error);

}

/*
 * union_relookup:
 *
 *	dvp should be locked on entry and will be locked on return.  No
 *	net change in the ref count will occur.
 *
 *	If an error is returned, *vpp will be invalid, otherwise it
 *	will hold a locked, referenced vnode.  If *vpp == dvp then
 *	remember that only one exclusive lock is held.
 */

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT, some of the work done
	 * by namei, some of the work done by lookup and some of
	 * the work done by VOP_LOOKUP when given a CREATE flag.
	 * Conclusion: Horrible.
	 */
	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = zalloc(namei_zone);
	bcopy(path, cn->cn_pnbuf, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn->cn_proc = cnp->cn_proc;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	VOP_UNLOCK(dvp, 0, cnp->cn_proc);

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and
	 * dereferenced.
	 */

	if ((error = relookup(dvp, vpp, cn)) != 0) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_proc);
		return (error);
	}

	/*
	 * If no error occurs, dvp will be returned locked with the
	 * reference left as before, and vpp will be returned referenced
	 * and locked.
	 *
	 * We want to return with dvp as it was passed to us, so we get
	 * rid of our reference.
	 */
	vrele(dvp);
	return (0);
}
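
/*
 * The copy-up path above and the shadow/whiteout paths below all
 * bracket their upper-layer modifications with vn_start_write() and
 * vn_finished_write(), so that a filesystem suspension can drain them.
 * Minimal sketch of the idiom (illustration only, not compiled):
 */
#if 0
	struct mount *mp;
	int error;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	/* ... modify the upper layer ... */
	vn_finished_write(mp);
#endif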

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory;
 * it is locked (but not ref'd) on entry and return.
 * (cnp) is the componentname to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked and ref'd.
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct proc *p = cnp->cn_proc;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	if (*vpp) {
		if (cn.cn_flags & HASBUF) {
			zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (dvp == *vpp)
			vrele(*vpp);
		else
			vput(*vpp);
		vn_finished_write(mp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	if (cn.cn_flags & HASBUF) {
		zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	/*vput(dvp);*/
	vn_finished_write(mp);
	return (error);
}

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout;
 * it is locked on entry and return.
 * (cnp) is the componentname to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct proc *p = cnp->cn_proc;
	struct vnode *wvp;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_finished_write(mp);
		return (error);
	}

	if (wvp) {
		if (cn.cn_flags & HASBUF) {
			zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (wvp == dvp)
			vrele(wvp);
		else
			vput(wvp);
		vn_finished_write(mp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, p->p_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (cn.cn_flags & HASBUF) {
		zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vn_finished_write(mp);
	return (error);
}
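
/*
 * For illustration (um_cmode itself is computed at mount time, in the
 * union mount code, from mode 0777 and the mounting process's umask):
 * with the common umask 022 a shadow directory is created mode 0755,
 * matching what mkdir(2) would have produced for that user.
 */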

/*
 * union_vn_create:	creates and opens a new shadow file
 *			on the upper union layer.  This function is
 *			similar in spirit to calling vn_open() but it
 *			avoids calling namei().  The problem with calling
 *			namei() is that a) it locks too many things, and
 *			b) it doesn't start at the "right" directory,
 *			whereas relookup() is told where to start.
 *
 * On entry, the vnode associated with un is locked.  It remains locked
 * on return.
 *
 * If no error occurs, *vpp contains a locked referenced vnode for your
 * use.  If an error occurs *vpp is undefined.
 */
static int
union_vn_create(vpp, un, p)
	struct vnode **vpp;
	struct union_node *un;
	struct proc *p;
{
	struct vnode *vp;
	struct ucred *cred = p->p_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask;
	struct componentname cn;

	*vpp = NULLVP;

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow()).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	cn.cn_pnbuf = zalloc(namei_zone);
	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn.cn_proc = p;
	cn.cn_cred = p->p_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_consume = 0;

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and
	 * dereferenced.
	 */
	VREF(un->un_dirvp);
	error = relookup(un->un_dirvp, &vp, &cn);
	if (error)
		return (error);

	/*
	 * If no error occurs, dvp will be returned locked with the
	 * reference left as before, and vpp will be returned referenced
	 * and locked.
	 */
	if (vp) {
		vput(un->un_dirvp);
		if (cn.cn_flags & HASBUF) {
			zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (vp == un->un_dirvp)
			vrele(vp);
		else
			vput(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 *
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, p, cred, LEASE_WRITE);
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (cn.cn_flags & HASBUF) {
		zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vput(un->un_dirvp);
	if (error)
		return (error);

	error = VOP_OPEN(vp, fmode, cred, p);
	if (error == 0 && vn_canvmio(vp) == TRUE)
		error = vfs_object_create(vp, p, cred);
	if (error) {
		vput(vp);
		return (error);
	}
	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

static int
union_vn_close(vp, fmode, cred, p)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct proc *p;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, p));
}
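
/*
 * Note on the pairing above: union_vn_create() bumps v_writecount by
 * hand after its VOP_OPEN(), just as vn_open() would have, and
 * union_vn_close() drops it again for FWRITE opens before calling
 * VOP_CLOSE(), keeping the writer count balanced for vnodes that were
 * never opened through namei().
 */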

#if 0

/*
 * union_removed_upper:
 *
 *	called with union_node unlocked. XXX
 */

void
union_removed_upper(un)
	struct union_node *un;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode **vpp;

	/*
	 * Do not set the uppervp to NULLVP.  If lowervp is NULLVP,
	 * the union node will have neither uppervp nor lowervp.  We
	 * remove the union node from cache, so that it will not be
	 * referenced.
	 */
	union_newupper(un, NULLVP);
	if (un->un_dircache != 0) {
		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(un->un_dircache, M_TEMP);
		un->un_dircache = 0;
	}

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
}

#endif

/*
 * Determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, p)
	struct union_node *un;
	struct ucred *cred;
	struct proc *p;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, p) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}

static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}

struct vnode *
union_dircache(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int cnt;
	struct vnode *nvp;
	struct vnode **vpp;
	struct vnode **dircache;
	struct union_node *un;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == NULL) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = malloc(cnt * sizeof(struct vnode *),
				M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	/*vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, p);*/
	UDEBUG(("ALLOCVP-3 %p ref %d\n", *vpp, (*vpp ? (*vpp)->v_usecount : -99)));
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, NULL, *vpp, NULLVP, 0);
	UDEBUG(("ALLOCVP-3B %p ref %d\n", nvp, (*vpp ? (*vpp)->v_usecount : -99)));
	if (error)
		goto out;

	VTOUNION(vp)->un_dircache = 0;
	un = VTOUNION(nvp);
	un->un_dircache = dircache;

out:
	VOP_UNLOCK(vp, 0, p);
	return (nvp);
}
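
/*
 * Shape of the un_dircache array built above, for illustration: one
 * referenced real (non-union) directory vnode per layer, uppermost
 * first, terminated by NULLVP:
 *
 *	dircache[0]	uppermost layer's directory vnode (already read)
 *	dircache[1]	next lower layer's directory vnode
 *	...
 *	dircache[n]	NULLVP
 *
 * Each call to union_dircache() hands back a union vnode stacked over
 * the next unread layer, so a union directory can be read one layer at
 * a time.
 */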

/*
 * Guarantee coherency with the VM cache by invalidating any clean VM pages
 * associated with this write and updating any dirty VM pages.  Since our
 * vnode is locked, other processes will not be able to read the pages in
 * again until after our write completes.
 *
 * We also have to be coherent with reads, by flushing any pending dirty
 * pages prior to issuing the read.
 *
 * XXX this is somewhat of a hack at the moment.  To support this properly
 * we would have to be able to run VOP_READ and VOP_WRITE through the VM
 * cache.  Then we wouldn't need to worry about coherency.
 */

void
union_vm_coherency(struct vnode *vp, struct uio *uio, int cleanfls)
{
	vm_object_t object;
	vm_pindex_t pstart;
	vm_pindex_t pend;
	int pgoff;

	if ((object = vp->v_object) == NULL)
		return;

	pgoff = uio->uio_offset & PAGE_MASK;
	pstart = uio->uio_offset / PAGE_SIZE;
	pend = pstart + (uio->uio_resid + pgoff + PAGE_MASK) / PAGE_SIZE;

	vm_object_page_clean(object, pstart, pend, OBJPC_SYNC);
	if (cleanfls)
		vm_object_page_remove(object, pstart, pend, TRUE);
}

/*
 * Module glue to remove #ifdef UNION from vfs_syscalls.c
 */
static int
union_dircheck(struct proc *p, struct vnode **vp, struct file *fp)
{
	int error = 0;

	if ((*vp)->v_op == union_vnodeop_p) {
		struct vnode *lvp;

		lvp = union_dircache(*vp, p);
		if (lvp != NULLVP) {
			struct vattr va;

			/*
			 * If the directory is opaque,
			 * then don't show lower entries
			 */
			error = VOP_GETATTR(*vp, &va, fp->f_cred, p);
			if (va.va_flags & OPAQUE) {
				vput(lvp);
				lvp = NULL;
			}
		}

		if (lvp != NULLVP) {
			error = VOP_OPEN(lvp, FREAD, fp->f_cred, p);
			if (error == 0 && vn_canvmio(lvp) == TRUE)
				error = vfs_object_create(lvp, p, fp->f_cred);
			if (error) {
				vput(lvp);
				return (error);
			}
			VOP_UNLOCK(lvp, 0, p);
			fp->f_data = (caddr_t) lvp;
			fp->f_offset = 0;
			error = vn_close(*vp, FREAD, fp->f_cred, p);
			if (error)
				return (error);
			*vp = lvp;
			return (-1);	/* goto unionread */
		}
	}
	return (error);
}

static int
union_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		union_dircheckp = union_dircheck;
		break;
	case MOD_UNLOAD:
		union_dircheckp = NULL;
		break;
	default:
		break;
	}
	return (0);
}

static moduledata_t union_mod = {
	"union_dircheck",
	union_modevent,
	NULL
};

DECLARE_MODULE(union_dircheck, union_mod, SI_SUB_VFS, SI_ORDER_ANY);
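
/*
 * Rough sketch of the consumer side of the union_dircheckp hook, for
 * illustration only (the actual caller lives in the getdirentries()
 * path in vfs_syscalls.c and may differ in detail): the hook is invoked
 * when a directory read reaches EOF, and a -1 return asks the caller to
 * restart the read on the next lower layer.
 */
#if 0
	if (vp->v_type == VDIR && union_dircheckp != NULL) {
		error = (*union_dircheckp)(p, &vp, fp);
		if (error == -1)
			goto unionread;		/* read the next layer */
		if (error != 0)
			return (error);
	}
#endif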