/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vnode_if.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>

/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
static int maxlockdepth = MAXDEPTH;

#ifdef LOCKF_DEBUGGING
#include <sys/sysctl.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
void lf_print(const char *tag, struct lockf *lock);
void lf_printlist(const char *tag, struct lockf *lock);
static int lockf_debug = 2;
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");

/*
 * If there is no mask bit selector, or there is one and the selector is
 * set, then output the debugging diagnostic.
 */
#define LOCKF_DEBUG(mask, ...)					\
	do {							\
		if (!(mask) || ((mask) & lockf_debug)) {	\
			printf(__VA_ARGS__);			\
		}						\
	} while (0)
#else	/* !LOCKF_DEBUGGING */
#define LOCKF_DEBUG(mask, ...)	/* mask */
#endif	/* !LOCKF_DEBUGGING */

MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

#define NOLOCKF (struct lockf *)0
#define SELF	0x1
#define OTHERS	0x2
#define OFF_MAX 0x7fffffffffffffffULL	/* max off_t */

/*
 * Overlapping lock states
 */
typedef enum {
	OVERLAP_NONE = 0,
	OVERLAP_EQUALS_LOCK,
	OVERLAP_CONTAINS_LOCK,
	OVERLAP_CONTAINED_BY_LOCK,
	OVERLAP_STARTS_BEFORE_LOCK,
	OVERLAP_ENDS_AFTER_LOCK
} overlap_t;

static int	 lf_clearlock(struct lockf *);
static overlap_t lf_findoverlap(struct lockf *,
	    struct lockf *, int, struct lockf ***, struct lockf **);
static struct lockf *lf_getblock(struct lockf *, pid_t);
static int	 lf_getlock(struct lockf *, struct flock *, pid_t);
static int	 lf_setlock(struct lockf *);
static int	 lf_split(struct lockf *, struct lockf *);
static void	 lf_wakelock(struct lockf *, boolean_t);

/*
 * lf_advlock
 *
 * Description:	Advisory record locking support
 *
 * Parameters:	ap			Argument pointer to a vnop_advlock_args
 *					argument descriptor structure for the
 *					lock operation to be attempted.
 *
 * Returns:	0			Success
 *		EOVERFLOW
 *		EINVAL
 *		ENOLCK			Number of locked regions exceeds limit
 *	lf_setlock:EAGAIN
 *	lf_setlock:EDEADLK
 *	lf_setlock:EINTR
 *	lf_setlock:ENOLCK
 *	lf_clearlock:ENOLCK
 *	vnode_size:???
 *
 * Notes:	We return ENOLCK when we run out of memory to support locks; as
 *		such, there is no specific expectation limit other than the
 *		amount of available resources.
 */
int
lf_advlock(struct vnop_advlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct flock *fl = ap->a_fl;
	vfs_context_t context = ap->a_context;
	struct lockf *lock;
	off_t start, end, oadd;
	u_quad_t size;
	int error;
	struct lockf **head = &vp->v_lockf;

	/* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			LOCKF_DEBUG(0, "lf_advlock: '%s' unlock without lock\n", vfs_context_proc(context)->p_comm);
			return (0);
		}
	}

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {

	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:

		/*
		 * It's OK to cast the u_quad_t to an off_t here, since they
		 * are the same storage size, and the value of the returned
		 * contents will never overflow into the sign bit.  We need to
		 * do this because we will use size to force range checks.
		 */
		if ((error = vnode_size(vp, (off_t *)&size, context))) {
			LOCKF_DEBUG(0, "lf_advlock: vnode_getattr failed: %d\n", error);
			return (error);
		}

		if (size > OFF_MAX ||
		    (fl->l_start > 0 &&
		     size > (u_quad_t)(OFF_MAX - fl->l_start)))
			return (EOVERFLOW);
		start = size + fl->l_start;
		break;

	default:
		LOCKF_DEBUG(0, "lf_advlock: unknown whence %d\n", fl->l_whence);
		return (EINVAL);
	}
	if (start < 0) {
		LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
		return (EINVAL);
	}
	if (fl->l_len < 0) {
		if (start == 0) {
			LOCKF_DEBUG(0, "lf_advlock: len < 0 & start == 0\n");
			return (EINVAL);
		}
		end = start - 1;
		start += fl->l_len;
		if (start < 0) {
			LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
			return (EINVAL);
		}
	} else if (fl->l_len == 0)
		end = -1;
	else {
		oadd = fl->l_len - 1;
		if (oadd > (off_t)(OFF_MAX - start)) {
			LOCKF_DEBUG(0, "lf_advlock: overflow\n");
			return (EOVERFLOW);
		}
		end = start + oadd;
	}
	/*
	 * Create the lockf structure
	 */
	MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
	if (lock == NULL)
		return (ENOLCK);
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_id = ap->a_id;
	lock->lf_vnode = vp;
	lock->lf_type = fl->l_type;
	lock->lf_head = head;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;

	if (ap->a_flags & F_FLOCK)
		lock->lf_flags |= F_WAKE1_SAFE;

	lck_mtx_lock(&vp->v_lock);	/* protect the lockf list */
	/*
	 * Do the requested operation.
	 */
	switch(ap->a_op) {
	case F_SETLK:
		error = lf_setlock(lock);
		break;

	case F_UNLCK:
		error = lf_clearlock(lock);
		FREE(lock, M_LOCKF);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl, -1);
		FREE(lock, M_LOCKF);
		break;

#if CONFIG_EMBEDDED
	case F_GETLKPID:
		error = lf_getlock(lock, fl, fl->l_pid);
		FREE(lock, M_LOCKF);
		break;
#endif

	default:
		FREE(lock, M_LOCKF);
		error = EINVAL;
		break;
	}
	lck_mtx_unlock(&vp->v_lock);	/* done manipulating the list */

	LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error);
	return (error);
}
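
/*
 * Illustration (editorial addition, not part of the original source): how a
 * userland fcntl(2) request maps onto the [lf_start, lf_end] range computed
 * above.  A hedged sketch; the descriptor "fd" is hypothetical.
 *
 *	struct flock fl = {
 *		.l_whence = SEEK_SET,
 *		.l_start  = 100,	// lock begins at byte 100
 *		.l_len    = 50,		// covers bytes 100..149
 *		.l_type   = F_WRLCK,
 *	};
 *	fcntl(fd, F_SETLK, &fl);	// lf_start = 100, lf_end = 149
 *
 * With l_len == 0 the lock runs to the end of the file (lf_end = -1); with
 * l_len < 0 the range is [l_start + l_len, l_start - 1], mirroring the
 * negative-length branch above.
 */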

/*
 * Empty the queue of msleeping requests for a lock on the given vnode.
 * Called with the vnode already locked.  Used for forced unmount, where
 * a flock(2) invoker sleeping on a blocked lock holds an iocount reference
 * that prevents the vnode from ever being drained.  Force unmounting wins.
 */
void
lf_abort_advlocks(vnode_t vp)
{
	struct lockf *lock;

	if ((lock = vp->v_lockf) == NULL)
		return;

	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);

	if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
		struct lockf *tlock;

		TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
			/*
			 * Setting this flag should cause all
			 * currently blocked F_SETLK requests to
			 * return to userland with an errno.
			 */
			tlock->lf_flags |= F_ABORT;
		}
		lf_wakelock(lock, TRUE);
	}
}

/*
 * Take any lock attempts which are currently blocked by a given lock ("from")
 * and mark them as blocked by a different lock ("to").  Used in the case
 * where a byte range currently occupied by "from" is to be occupied by "to."
 */
static void
lf_move_blocked(struct lockf *to, struct lockf *from)
{
	struct lockf *tlock;

	TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
		tlock->lf_next = to;
	}

	TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);
}

/*
 * lf_coalesce_adjacent
 *
 * Description:	Helper function: when setting a lock, coalesce adjacent
 *		locks.  Needed because adjacent locks are not overlapping,
 *		but POSIX requires that they be coalesced.
 *
 * Parameters:	lock			The new lock which may be adjacent
 *					to already locked regions, and which
 *					should therefore be coalesced with them
 *
 * Returns:	<void>
 */
static void
lf_coalesce_adjacent(struct lockf *lock)
{
	struct lockf **lf = lock->lf_head;

	while (*lf != NOLOCKF) {
		/* reject locks that obviously could not be coalesced */
		if ((*lf == lock) ||
		    ((*lf)->lf_id != lock->lf_id) ||
		    ((*lf)->lf_type != lock->lf_type)) {
			lf = &(*lf)->lf_next;
			continue;
		}

		/*
		 * NOTE: Assumes that if two locks are adjacent on the number line
		 * and belong to the same owner, then they are adjacent on the list.
		 */
		if ((*lf)->lf_end != -1 &&
		    ((*lf)->lf_end + 1) == lock->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent previous\n");
			lock->lf_start = (*lf)->lf_start;
			*lf = lock;
			lf = &(*lf)->lf_next;

			lf_move_blocked(lock, adjacent);

			FREE(adjacent, M_LOCKF);
			continue;
		}
		/* If the lock starts adjacent to us, we can coalesce it */
		if (lock->lf_end != -1 &&
		    (lock->lf_end + 1) == (*lf)->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent following\n");
			lock->lf_end = (*lf)->lf_end;
			lock->lf_next = (*lf)->lf_next;
			lf = &lock->lf_next;

			lf_move_blocked(lock, adjacent);

			FREE(adjacent, M_LOCKF);
			continue;
		}

		/* no matching conditions; go on to next lock */
		lf = &(*lf)->lf_next;
	}
}
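
/*
 * Worked example (editorial addition, illustrative only): if a process holds
 * an F_WRLCK on bytes [0, 99] and then sets an F_WRLCK on [100, 199], the two
 * regions do not overlap, so lf_findoverlap() never merges them; it is
 * lf_coalesce_adjacent() that folds them into a single lock covering
 * [0, 199], as POSIX requires.  Any waiters queued on the absorbed entry are
 * re-pointed at the surviving lock via lf_move_blocked().
 */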

/*
 * lf_setlock
 *
 * Description:	Set a byte-range lock.
 *
 * Parameters:	lock			The lock structure describing the lock
 *					to be set; allocated by the caller, it
 *					will be linked into the lock list if
 *					the set is successful, and freed if the
 *					set is unsuccessful.
 *
 * Returns:	0			Success
 *		EAGAIN
 *		EDEADLK
 *	lf_split:ENOLCK
 *	lf_clearlock:ENOLCK
 *	msleep:EINTR
 *
 * Notes:	We add the lock to the provisional lock list.  We do not
 *		coalesce at this time; this has implications for other lock
 *		requestors in the blocker search mechanism.
 */
static int
lf_setlock(struct lockf *lock)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	static char lockstr[] = "lockf";
	int priority, needtolink, error;
	struct vnode *vp = lock->lf_vnode;
	overlap_t ovcase;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1) {
		lf_print("lf_setlock", lock);
		lf_printlist("lf_setlock(in)", lock);
	}
#endif /* LOCKF_DEBUGGING */

	/*
	 * Set the priority
	 */
	priority = PLOCK;
	if (lock->lf_type == F_WRLCK)
		priority += 4;
	priority |= PCATCH;
	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock, -1))) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			FREE(lock, M_LOCKF);
			return (EAGAIN);
		}

		/*
		 * We are blocked.  Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 * For byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us.  MAXDEPTH is set just to make sure we
		 * do not go off into neverland.
		 */
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct proc *wproc, *bproc;
			struct uthread *ut;
			struct lockf *waitblock;
			int i = 0;

			/* The block is waiting on something */
			wproc = (struct proc *)block->lf_id;
			proc_lock(wproc);
			TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
				/*
				 * While the thread is asleep (uu_wchan != 0)
				 * in this code (uu_wmesg == lockstr)
				 * and we have not exceeded the maximum cycle
				 * depth (i < maxlockdepth), then check for a
				 * cycle to see if the lock is blocked behind
				 * someone blocked behind us.
				 */
				while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
				    ut->uu_wmesg == lockstr &&
				    (i++ < maxlockdepth)) {
					waitblock = (struct lockf *)ut->uu_wchan;
					/*
					 * Get the lock blocking the lock
					 * which would block us, and make
					 * certain it hasn't come unblocked
					 * (been granted, e.g. between the time
					 * we called lf_getblock, and the time
					 * we successfully acquired the
					 * proc_lock).
					 */
					waitblock = waitblock->lf_next;
					if (waitblock == NULL)
						break;

					/*
					 * Make sure it's an advisory range
					 * lock and not an overall file lock;
					 * if we mix lock types, it's our own
					 * fault.
					 */
					if ((waitblock->lf_flags & F_POSIX) == 0)
						break;

					/*
					 * If the owner of the lock that's
					 * blocking a lock that's blocking us
					 * is the process requesting this lock,
					 * then we would deadlock, so error out.
					 */
					bproc = (struct proc *)waitblock->lf_id;
					if (bproc == (struct proc *)lock->lf_id) {
						proc_unlock(wproc);
						FREE(lock, M_LOCKF);
						return (EDEADLK);
					}
				}
			}
			proc_unlock(wproc);
		}

		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			if ((error = lf_clearlock(lock)) != 0) {
				FREE(lock, M_LOCKF);
				return (error);
			}
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);

		if ( !(lock->lf_flags & F_FLOCK))
			block->lf_flags &= ~F_WAKE1_SAFE;

#ifdef LOCKF_DEBUGGING
		if (lockf_debug & 1) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock(block)", block);
		}
#endif /* LOCKF_DEBUGGING */
		error = msleep(lock, &vp->v_lock, priority, lockstr, 0);

		if (error == 0 && (lock->lf_flags & F_ABORT) != 0)
			error = EBADF;

		if (lock->lf_next) {
			/*
			 * lf_wakelock() always sets wakelock->lf_next to
			 * NULL before a wakeup; so we've been woken early
			 * - perhaps by a debugger, signal or other event.
			 *
			 * Remove 'lock' from the block list (avoids double-add
			 * in the spurious case, which would create a cycle)
			 */
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
			lock->lf_next = NULL;

			if (error == 0) {
				/*
				 * If this was a spurious wakeup, retry
				 */
				printf("%s: spurious wakeup, retrying lock\n",
				    __func__);
				continue;
			}
		}

		if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
			if ((block = lf_getblock(lock, -1)) != NULL)
				lf_move_blocked(block, lock);
		}

		if (error) {
			if (!TAILQ_EMPTY(&lock->lf_blkhd))
				lf_wakelock(lock, TRUE);
			FREE(lock, M_LOCKF);
			return (error);
		}
	}

	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case OVERLAP_NONE:
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case OVERLAP_EQUALS_LOCK:
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap, TRUE);
			overlap->lf_type = lock->lf_type;
			FREE(lock, M_LOCKF);
			lock = overlap; /* for lf_coalesce_adjacent() */
			break;

		case OVERLAP_CONTAINS_LOCK:
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				FREE(lock, M_LOCKF);
				lock = overlap; /* for lf_coalesce_adjacent() */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else {
				/*
				 * If we can't split the lock, we can't
				 * grant it.  Claim a system limit for the
				 * resource shortage.
				 */
				if (lf_split(overlap, lock)) {
					FREE(lock, M_LOCKF);
					return (ENOLCK);
				}
			}
			lf_wakelock(overlap, TRUE);
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			} else {
				while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
					ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
					ltmp->lf_next = lock;
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap, TRUE);
			needtolink = 0;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap, TRUE);
			break;
		}
		break;
	}
	/* Coalesce adjacent locks with identical attributes */
	lf_coalesce_adjacent(lock);
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock(out)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	return (0);
}
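
/*
 * Worked example (editorial addition, illustrative only): suppose a process
 * holds an F_WRLCK on [0, 999] and then requests an F_RDLCK on [400, 599].
 * lf_findoverlap() reports OVERLAP_CONTAINS_LOCK with differing types, so
 * lf_split() carves the holder into [0, 399] and [600, 999], the new read
 * lock occupies [400, 599], and lf_wakelock() gives any readers queued
 * behind the old write lock a chance to acquire the downgraded middle region.
 */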

/*
 * lf_clearlock
 *
 * Description:	Remove a byte-range lock on a vnode.  Generally, find the
 *		lock (or an overlap to that lock) and remove it (or shrink
 *		it), then wakeup anyone we can.
 *
 * Parameters:	unlock			The lock to clear
 *
 * Returns:	0			Success
 *	lf_split:ENOLCK
 *
 * Notes:	A caller may unlock all the locks owned by the caller by
 *		specifying the entire file range; locks owned by other
 *		callers are not affected by this operation.
 */
static int
lf_clearlock(struct lockf *unlock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	overlap_t ovcase;

	if (lf == NOLOCKF)
		return (0);
#ifdef LOCKF_DEBUGGING
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUGGING */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap, FALSE);

		switch (ovcase) {
		case OVERLAP_NONE:	/* satisfy compiler enum/switch */
			break;

		case OVERLAP_EQUALS_LOCK:
			*prev = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			break;

		case OVERLAP_CONTAINS_LOCK: /* split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			/*
			 * If we can't split the lock, we can't grant it.
			 * Claim a system limit for the resource shortage.
			 */
			if (lf_split(overlap, unlock))
				return (ENOLCK);
			overlap->lf_next = unlock->lf_next;
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUGGING */
	return (0);
}


/*
 * lf_getlock
 *
 * Description:	Check whether there is a blocking lock, and if so return
 *		its process identifier into the lock being requested.
 *
 * Parameters:	lock			Pointer to lock to test for blocks
 *		fl			Pointer to flock structure to receive
 *					the blocking lock information, if a
 *					blocking lock is found.
 *		matchpid		-1, or pid value to match in lookup.
 *
 * Returns:	0			Success
 *
 * Implicit Returns:
 *		*fl			Contents modified to reflect the
 *					blocking lock, if one is found; not
 *					modified otherwise
 *
 * Notes:	fl->l_pid will be (-1) for file locks and will only be set to
 *		the blocking process ID for advisory record locks.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
{
	struct lockf *block;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUGGING */

	if ((block = lf_getblock(lock, matchpid))) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (block->lf_flags & F_POSIX)
			fl->l_pid = proc_pid((struct proc *)(block->lf_id));
		else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return (0);
}

/*
 * lf_getblock
 *
 * Description:	Walk the list of locks for an inode and return the first
 *		blocking lock.  A lock is considered blocking if we are not
 *		the lock owner; otherwise, we are permitted to upgrade or
 *		downgrade it, and it's not considered blocking.
 *
 * Parameters:	lock			The lock for which we are interested
 *					in obtaining the blocking lock, if any
 *		matchpid		-1, or pid value to match in lookup.
 *
 * Returns:	NOLOCKF			No blocking lock exists
 *		!NOLOCKF		The address of the blocking lock's
 *					struct lockf.
 */
static struct lockf *
lf_getblock(struct lockf *lock, pid_t matchpid)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	for (prev = lock->lf_head;
	    lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
	    lf = overlap->lf_next) {
		/*
		 * Found an overlap.
		 *
		 * If we're matching pids, and it's a record lock,
		 * but the pid doesn't match, then keep on looking ..
		 */
		if (matchpid != -1 &&
		    (overlap->lf_flags & F_POSIX) != 0 &&
		    proc_pid((struct proc *)(overlap->lf_id)) != matchpid)
			continue;
		/*
		 * does it block us?
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
			return (overlap);
	}
	return (NOLOCKF);
}
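
/*
 * Illustration (editorial addition, not part of the original source): how
 * lf_getlock() surfaces through fcntl(2) in userland.  A hedged sketch; the
 * descriptor "fd" is hypothetical.
 *
 *	struct flock fl = {
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 0,		// test the whole file
 *		.l_type   = F_WRLCK,
 *	};
 *	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK) {
 *		// fl now describes the first blocking lock; fl.l_pid is
 *		// the owner for POSIX record locks, or -1 for flock(2)
 *		// style whole-file locks.
 *	}
 */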

/*
 * lf_findoverlap
 *
 * Description:	Walk the list of locks to find an overlapping lock (if any).
 *
 * Parameters:	lf			First lock on lock list
 *		lock			The lock we are checking for an overlap
 *		type			Check type: SELF or OTHERS
 *		prev			Address through which to return the
 *					address of the next pointer in the
 *					lock preceding the overlapping lock
 *		overlap			Address through which to return the
 *					address of the overlapping lock
 *
 * Returns:	OVERLAP_NONE
 *		OVERLAP_EQUALS_LOCK
 *		OVERLAP_CONTAINS_LOCK
 *		OVERLAP_CONTAINED_BY_LOCK
 *		OVERLAP_STARTS_BEFORE_LOCK
 *		OVERLAP_ENDS_AFTER_LOCK
 *
 * Implicit Returns:
 *		*prev			The address of the next pointer in the
 *					lock previous to the overlapping lock;
 *					this is generally used to relink the
 *					lock list, avoiding a second iteration.
 *		*overlap		The pointer to the overlapping lock
 *					itself; this is used to return data in
 *					the type == OTHERS case, and for the
 *					caller to modify the overlapping lock,
 *					in the type == SELF case
 *
 * Note:	This returns only the FIRST overlapping lock.  There may be
 *		more than one.  lf_getlock will return the first blocking lock,
 *		while lf_setlock will iterate over all overlapping locks to
 *		handle the requested operation.
 *
 *		The type parameter can be SELF, meaning we are looking for
 *		overlapping locks owned by us, or it can be OTHERS, meaning
 *		we are looking for overlapping locks owned by someone else so
 *		we can report a blocking lock on an F_GETLK request.
 *
 *		The value of *overlap and *prev are modified, even if there is
 *		no overlapping lock found; always check the return code.
 */
static overlap_t
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;
	int found_self = 0;

	*overlap = lf;
	if (lf == NOLOCKF)
		return (OVERLAP_NONE);
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 2)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUGGING */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NOLOCKF) {
		if (((type & SELF) && lf->lf_id != lock->lf_id) ||
		    ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
			/*
			 * Locks belonging to one process are adjacent on the
			 * list, so if we've found any locks belonging to us,
			 * and we're now seeing something else, then we've
			 * examined all "self" locks.  Note that bailing out
			 * here is quite important; for coalescing, we assume
			 * numerically adjacent locks from the same owner to
			 * be adjacent on the list.
			 */
			if ((type & SELF) && found_self) {
				return OVERLAP_NONE;
			}

			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}

		if ((type & SELF)) {
			found_self = 1;
		}

#ifdef LOCKF_DEBUGGING
		if (lockf_debug & 2)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUGGING */
		/*
		 * OK, check for overlap
		 */
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
			LOCKF_DEBUG(2, "no overlap\n");

			/*
			 * NOTE: assumes that locks for the same process are
			 * nonintersecting and ordered.
			 */
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return (OVERLAP_NONE);
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			LOCKF_DEBUG(2, "overlap == lock\n");
			return (OVERLAP_EQUALS_LOCK);
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(2, "overlap contains lock\n");
			return (OVERLAP_CONTAINS_LOCK);
		}
		if (start <= lf->lf_start &&
		    (end == -1 ||
		     (lf->lf_end != -1 && end >= lf->lf_end))) {
			LOCKF_DEBUG(2, "lock contains overlap\n");
			return (OVERLAP_CONTAINED_BY_LOCK);
		}
		if ((lf->lf_start < start) &&
		    ((lf->lf_end >= start) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(2, "overlap starts before lock\n");
			return (OVERLAP_STARTS_BEFORE_LOCK);
		}
		if ((lf->lf_start > start) &&
		    (end != -1) &&
		    ((lf->lf_end > end) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(2, "overlap ends after lock\n");
			return (OVERLAP_ENDS_AFTER_LOCK);
		}
		panic("lf_findoverlap: default");
	}
	return (OVERLAP_NONE);
}


/*
 * lf_split
 *
 * Description:	Split a lock and a contained region into two or three locks
 *		as necessary.
 *
 * Parameters:	lock1			Lock to split
 *		lock2			Overlapping lock region requiring the
 *					split (upgrade/downgrade/unlock)
 *
 * Returns:	0			Success
 *		ENOLCK			No memory for new lock
 *
 * Implicit Returns:
 *		*lock1			Modified original lock
 *		*lock2			Overlapping lock (inserted into list)
 *		(new lock)		Potential new lock inserted into list
 *					if split results in 3 locks
 *
 * Notes:	This operation can only fail if the split would result in three
 *		locks, and there is insufficient memory to allocate the third
 *		lock; in that case, neither of the locks will be modified.
 */
static int
lf_split(struct lockf *lock1, struct lockf *lock2)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 2) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUGGING */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return (0);
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return (0);
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
	if (splitlock == NULL)
		return (ENOLCK);
	bcopy(lock1, splitlock, sizeof *splitlock);
	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;

	return (0);
}
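
/*
 * Worked example (editorial addition, illustrative only): splitting
 * lock1 = [0, 999] around a contained region lock2 = [400, 599] yields
 * three list entries:
 *
 *	lock1     -> [0, 399]	(original, end trimmed)
 *	lock2     -> [400, 599]	(the contained region, linked in)
 *	splitlock -> [600, 999]	(new allocation, copied from lock1)
 *
 * Only this three-way case can fail (ENOLCK), since it is the only one that
 * allocates memory; the two-piece cases merely trim lock1.
 */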

/*
 * lf_wakelock
 *
 * Wakeup a blocklist in the case of a downgrade or unlock, since others
 * waiting on the lock may now be able to acquire it.
 *
 * Parameters:	listhead		Lock list head on which waiters may
 *					have pending locks
 *		force_all		If TRUE, wake all waiters even when
 *					the list head is marked F_WAKE1_SAFE
 *
 * Returns:	<void>
 *
 * Notes:	This function iterates a list of locks and wakes all waiters,
 *		rather than only waiters for the contended regions.  Because
 *		of this, for heavily contended files, this can result in a
 *		"thundering herd" situation.  Refactoring the code could make
 *		this operation more efficient, if heavy contention ever results
 *		in a real-world performance problem.
 */
static void
lf_wakelock(struct lockf *listhead, boolean_t force_all)
{
	struct lockf *wakelock;
	boolean_t wake_all = TRUE;

	if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
		wake_all = FALSE;

	while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
		wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);

		wakelock->lf_next = NOLOCKF;
#ifdef LOCKF_DEBUGGING
		if (lockf_debug & 2)
			lf_print("lf_wakelock: awakening", wakelock);
#endif /* LOCKF_DEBUGGING */
		if (wake_all == FALSE) {
			/*
			 * If there are items on the list head block list,
			 * move them to the wakelock list instead, and then
			 * correct their lf_next pointers.
			 */
			if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
				TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);

				struct lockf *tlock;

				TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
					if (TAILQ_NEXT(tlock, lf_block) == tlock) {
						/* See rdar://10887303 */
						panic("cycle in wakelock list");
					}
					tlock->lf_next = wakelock;
				}
			}
		}
		wakeup(wakelock);

		if (wake_all == FALSE)
			break;
	}
}


#ifdef LOCKF_DEBUGGING
/*
 * lf_print DEBUG
 *
 * Print out a lock; lock information is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose information should be
 *					displayed
 *
 * Returns:	<void>
 */
void
lf_print(const char *tag, struct lockf *lock)
{
	printf("%s: lock %p for ", tag, (void *)lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
	else
		printf("id %p", (void *)lock->lf_id);
	if (lock->lf_vnode != 0)
		printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_vnode,
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	else
		printf(" %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ?
"unlock" : "unknown", 1249 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end); 1250 if (!TAILQ_EMPTY(&lock->lf_blkhd)) 1251 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd)); 1252 else 1253 printf("\n"); 1254} 1255 1256 1257/* 1258 * lf_printlist DEBUG 1259 * 1260 * Print out a lock list for the vnode associated with 'lock'; lock information 1261 * is prefixed by the string in 'tag' 1262 * 1263 * Parameters: tag A string tag for debugging 1264 * lock The lock whose vnode's lock list should 1265 * be displayed 1266 * 1267 * Returns: <void> 1268 */ 1269void 1270lf_printlist(const char *tag, struct lockf *lock) 1271{ 1272 struct lockf *lf, *blk; 1273 1274 if (lock->lf_vnode == 0) 1275 return; 1276 1277 printf("%s: Lock list for vno %p:\n", 1278 tag, lock->lf_vnode); 1279 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) { 1280 printf("\tlock %p for ",(void *)lf); 1281 if (lf->lf_flags & F_POSIX) 1282 printf("proc %ld", 1283 (long)((struct proc *)lf->lf_id)->p_pid); 1284 else 1285 printf("id %p", (void *)lf->lf_id); 1286 printf(", %s, start 0x%016llx, end 0x%016llx", 1287 lf->lf_type == F_RDLCK ? "shared" : 1288 lf->lf_type == F_WRLCK ? "exclusive" : 1289 lf->lf_type == F_UNLCK ? "unlock" : 1290 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); 1291 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { 1292 printf("\n\t\tlock request %p for ", (void *)blk); 1293 if (blk->lf_flags & F_POSIX) 1294 printf("proc %ld", 1295 (long)((struct proc *)blk->lf_id)->p_pid); 1296 else 1297 printf("id %p", (void *)blk->lf_id); 1298 printf(", %s, start 0x%016llx, end 0x%016llx", 1299 blk->lf_type == F_RDLCK ? "shared" : 1300 blk->lf_type == F_WRLCK ? "exclusive" : 1301 blk->lf_type == F_UNLCK ? "unlock" : 1302 "unknown", (intmax_t)blk->lf_start, 1303 (intmax_t)blk->lf_end); 1304 if (!TAILQ_EMPTY(&blk->lf_blkhd)) 1305 panic("lf_printlist: bad list"); 1306 } 1307 printf("\n"); 1308 } 1309} 1310#endif /* LOCKF_DEBUGGING */ 1311