/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	File:	kern/lock.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	Locking primitives implementation
 */

#include <mach_ldebug.h>

#include <kern/lock.h>
#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/xpr.h>
#include <kern/debug.h>
#include <string.h>

#include <i386/machine_routines.h>	/* machine_timeout_suspended() */
#include <machine/machine_cpu.h>
#include <i386/mp.h>

#include <sys/kdebug.h>
#include <mach/branch_predicates.h>

/*
 * We need only enough declarations from the BSD-side to be able to
 * test if our probe is active, and to call __dtrace_probe().  Setting
 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
 */
#if	CONFIG_DTRACE
#define NEED_DTRACE_DEFS
#include <../bsd/sys/lockstat.h>
#endif

#define	LCK_RW_LCK_EXCLUSIVE_CODE	0x100
#define	LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
#define	LCK_RW_LCK_SHARED_CODE		0x102
#define	LCK_RW_LCK_SH_TO_EX_CODE	0x103
#define	LCK_RW_LCK_SH_TO_EX1_CODE	0x104
#define	LCK_RW_LCK_EX_TO_SH_CODE	0x105

#define	LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
#define	LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
#define	LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
#define	LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
#define	LCK_RW_LCK_SHARED_SPIN_CODE	0x110
#define	LCK_RW_LCK_SHARED_WAIT_CODE	0x111
#define	LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
#define	LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113


#define	ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

unsigned int LcksOpts = 0;

/* Forwards */

#if	USLOCK_DEBUG
/*
 *	Perform simple lock checks.
 */
int	uslock_check = 1;
int	max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif	/* USLOCK_DEBUG */


/*
 *	We often want to know the addresses of the callers
 *	of the various lock routines.  However, this information
 *	is only used for debugging and statistics.
 */
typedef void	*pc_t;
#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
#if	ANY_LOCK_DEBUG
#define	OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
#define DECL_PC(pc)	pc_t pc;
#else	/* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef	lint
/*
 *	Eliminate lint complaints about unused local pc variables.
 */
#define	OBTAIN_PC(pc)	++pc
#else	/* lint */
#define	OBTAIN_PC(pc)
#endif	/* lint */
#endif	/* ANY_LOCK_DEBUG */


/*
 *	Portable lock package implementation of usimple_locks.
 */

#if	USLOCK_DEBUG
#define	USLDBG(stmt)	stmt
void		usld_lock_init(usimple_lock_t, unsigned short);
void		usld_lock_pre(usimple_lock_t, pc_t);
void		usld_lock_post(usimple_lock_t, pc_t);
void		usld_unlock(usimple_lock_t, pc_t);
void		usld_lock_try_pre(usimple_lock_t, pc_t);
void		usld_lock_try_post(usimple_lock_t, pc_t);
int		usld_lock_common_checks(usimple_lock_t, char *);
#else	/* USLOCK_DEBUG */
#define	USLDBG(stmt)
#endif	/* USLOCK_DEBUG */


extern int lck_rw_grab_want(lck_rw_t *lck);
extern int lck_rw_grab_shared(lck_rw_t *lck);
extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);


/*
 * Forward definitions
 */

void lck_rw_lock_shared_gen(
	lck_rw_t	*lck);

void lck_rw_lock_exclusive_gen(
	lck_rw_t	*lck);

boolean_t lck_rw_lock_shared_to_exclusive_success(
	lck_rw_t	*lck);

boolean_t lck_rw_lock_shared_to_exclusive_failure(
	lck_rw_t	*lck,
	int		prior_lock_state);

void lck_rw_lock_exclusive_to_shared_gen(
	lck_rw_t	*lck,
	int		prior_lock_state);

lck_rw_type_t lck_rw_done_gen(
	lck_rw_t	*lck,
	int		prior_lock_state);

void lck_rw_clear_promotions_x86(thread_t thread);

/*
 *	Routine:	lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_spin_t	*lck;

	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
		lck_spin_init(lck, grp, attr);

	return(lck);
}

/*
 *	Routine:	lck_spin_free
 */
void
lck_spin_free(
	lck_spin_t	*lck,
	lck_grp_t	*grp)
{
	lck_spin_destroy(lck, grp);
	kfree(lck, sizeof(lck_spin_t));
}

/*
 *	Routine:	lck_spin_init
 */
void
lck_spin_init(
	lck_spin_t	*lck,
	lck_grp_t	*grp,
	__unused lck_attr_t	*attr)
{
	usimple_lock_init((usimple_lock_t) lck, 0);
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
}

/*
 *	Routine:	lck_spin_destroy
 */
void
lck_spin_destroy(
	lck_spin_t	*lck,
	lck_grp_t	*grp)
{
	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
		return;
	lck->interlock = LCK_SPIN_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
	lck_grp_deallocate(grp);
	return;
}

/*
 *	Routine:	lck_spin_lock
 */
void
lck_spin_lock(
	lck_spin_t	*lck)
{
	usimple_lock((usimple_lock_t) lck);
}

/*
 *	Routine:	lck_spin_unlock
 */
void
lck_spin_unlock(
	lck_spin_t	*lck)
{
	usimple_unlock((usimple_lock_t) lck);
}


/*
 *	Routine:	lck_spin_try_lock
 */
boolean_t
lck_spin_try_lock(
	lck_spin_t	*lck)
{
	return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
}

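/*
 * Typical client usage of the lck_spin interface above, as an illustrative
 * sketch only.  The lock-group setup (lck_grp_alloc_init() and
 * LCK_GRP_ATTR_NULL) lives in kern/locks.c rather than in this file, so that
 * part is an assumption about the caller's environment:
 *
 *	lck_grp_t	*grp = lck_grp_alloc_init("my_subsystem", LCK_GRP_ATTR_NULL);
 *	lck_spin_t	*sl  = lck_spin_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_spin_lock(sl);		// returns with preemption disabled
 *	...				// short, non-blocking critical section
 *	lck_spin_unlock(sl);
 *
 *	lck_spin_free(sl, grp);		// lck_spin_destroy() + kfree()
 */
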
/*
 *	Initialize a usimple_lock.
 *
 *	No change in preemption state.
 */
void
usimple_lock_init(
	usimple_lock_t	l,
	__unused unsigned short	tag)
{
#ifndef	MACHINE_SIMPLE_LOCK
	USLDBG(usld_lock_init(l, tag));
	hw_lock_init(&l->interlock);
#else
	simple_lock_init((simple_lock_t)l, tag);
#endif
}

volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;

static uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
	uint64_t deadline;
	uint32_t i;

	for (i = 0; i < real_ncpus; i++) {
		if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
			spinlock_owner_cpu = i;
			if ((uint32_t) cpu_number() == i)
				break;
			cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
			cpu_NMI_interrupt(i);
			deadline = mach_absolute_time() + (LockTimeOut * 2);
			while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
				cpu_pause();
			break;
		}
	}

	return spinlock_owner_cpu;
}

/*
 *	Acquire a usimple_lock.
 *
 *	Returns with preemption disabled.  Note
 *	that the hw_lock routines are responsible for
 *	maintaining preemption state.
 */
void
usimple_lock(
	usimple_lock_t	l)
{
#ifndef	MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));

	if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
		boolean_t uslock_acquired = FALSE;
		while (machine_timeout_suspended()) {
			enable_preemption();
			if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
				break;
		}

		if (uslock_acquired == FALSE) {
			uint32_t lock_cpu;
			uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
			spinlock_timed_out = l;
			lock_cpu = spinlock_timeout_NMI(lowner);
			panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
		}
	}
	USLDBG(usld_lock_post(l, pc));
#else
	simple_lock((simple_lock_t)l);
#endif
}


/*
 *	Release a usimple_lock.
 *
 *	Returns with preemption enabled.  Note
 *	that the hw_lock routines are responsible for
 *	maintaining preemption state.
 */
void
usimple_unlock(
	usimple_lock_t	l)
{
#ifndef	MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));
	hw_lock_unlock(&l->interlock);
#else
	simple_unlock_rwmb((simple_lock_t)l);
#endif
}


/*
 *	Conditionally acquire a usimple_lock.
 *
 *	On success, returns with preemption disabled.
 *	On failure, returns with preemption in the same state
 *	as when first invoked.  Note that the hw_lock routines
 *	are responsible for maintaining preemption state.
 *
 *	XXX No stats are gathered on a miss; I preserved this
 *	behavior from the original assembly-language code, but
 *	doesn't it make sense to log misses?  XXX
 */
unsigned int
usimple_lock_try(
	usimple_lock_t	l)
{
#ifndef	MACHINE_SIMPLE_LOCK
	unsigned int	success;
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_try_pre(l, pc));
	if ((success = hw_lock_try(&l->interlock))) {
		USLDBG(usld_lock_try_post(l, pc));
	}
	return success;
#else
	return(simple_lock_try((simple_lock_t)l));
#endif
}

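/*
 * Illustrative pattern for the conditional acquire above (a sketch only,
 * mirroring the preemption notes in the comments): on success the lock is
 * held and preemption is disabled; on failure the preemption state is
 * unchanged and the caller must not touch the protected data.
 *
 *	if (usimple_lock_try(l)) {
 *		...			// protected work
 *		usimple_unlock(l);	// re-enables preemption
 *	} else {
 *		...			// back off, or fall back to usimple_lock(l)
 *	}
 */
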
#if	USLOCK_DEBUG
/*
 *	States of a usimple_lock.  The default when initializing
 *	a usimple_lock is setting it up for debug checking.
 */
#define	USLOCK_CHECKED		0x0001		/* lock is being checked */
#define	USLOCK_TAKEN		0x0002		/* lock has been taken */
#define	USLOCK_INIT		0xBAA0		/* lock has been initialized */
#define	USLOCK_INITIALIZED	(USLOCK_INIT|USLOCK_CHECKED)
#define	USLOCK_CHECKING(l)	(uslock_check &&			\
				 ((l)->debug.state & USLOCK_CHECKED))

/*
 *	Trace activities of a particularly interesting lock.
 */
void	usl_trace(usimple_lock_t, int, pc_t, const char *);


/*
 *	Initialize the debugging information contained
 *	in a usimple_lock.
 */
void
usld_lock_init(
	usimple_lock_t	l,
	__unused unsigned short	tag)
{
	if (l == USIMPLE_LOCK_NULL)
		panic("lock initialization: null lock pointer");
	l->lock_type = USLOCK_TAG;
	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
	l->debug.duration[0] = l->debug.duration[1] = 0;
	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
}


/*
 *	These checks apply to all usimple_locks, not just
 *	those with USLOCK_CHECKED turned on.
 */
int
usld_lock_common_checks(
	usimple_lock_t	l,
	char		*caller)
{
	if (l == USIMPLE_LOCK_NULL)
		panic("%s:  null lock pointer", caller);
	if (l->lock_type != USLOCK_TAG)
		panic("%s:  %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	if (!(l->debug.state & USLOCK_INIT))
		panic("%s:  %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	return USLOCK_CHECKING(l);
}


/*
 *	Debug checks on a usimple_lock just before attempting
 *	to acquire it.
 */
/* ARGSUSED */
void
usld_lock_pre(
	usimple_lock_t	l,
	pc_t		pc)
{
	char	caller[] = "usimple_lock";


	if (!usld_lock_common_checks(l, caller))
		return;

/*
 *	Note that we have a weird case where we are getting a lock when we are
 *	in the process of putting the system to sleep.  We are running with no
 *	current threads, therefore we can't tell if we are trying to retake a lock
 *	we have or someone on the other processor has it.  Therefore we just
 *	ignore this test if the locking thread is 0.
 */

	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	    l->debug.lock_thread == (void *) current_thread()) {
		printf("%s:  lock %p already locked (at %p) by",
		       caller, l, l->debug.lock_pc);
		printf(" current thread %p (new attempt at pc %p)\n",
		       l->debug.lock_thread, pc);
		panic("%s", caller);
	}
	mp_disable_preemption();
	usl_trace(l, cpu_number(), pc, caller);
	mp_enable_preemption();
}


/*
 *	Debug checks on a usimple_lock just after acquiring it.
 *
 *	Pre-emption has been disabled at this point,
 *	so we are safe in using cpu_number.
 */
void
usld_lock_post(
	usimple_lock_t	l,
	pc_t		pc)
{
	register int	mycpu;
	char	caller[] = "successful usimple_lock";


	if (!usld_lock_common_checks(l, caller))
		return;

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
		panic("%s:  lock %p became uninitialized",
		      caller, l);
	if ((l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock 0x%p became TAKEN by someone else",
		      caller, l);

	mycpu = cpu_number();
	l->debug.lock_thread = (void *)current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = mycpu;

	usl_trace(l, mycpu, pc, caller);
}


/*
 *	Debug checks on a usimple_lock just before
 *	releasing it.  Note that the caller has not
 *	yet released the hardware lock.
 *
 *	Preemption is still disabled, so there's
 *	no problem using cpu_number.
 */
void
usld_unlock(
	usimple_lock_t	l,
	pc_t		pc)
{
	register int	mycpu;
	char	caller[] = "usimple_unlock";


	if (!usld_lock_common_checks(l, caller))
		return;

	mycpu = cpu_number();

	if (!(l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock 0x%p hasn't been taken",
		      caller, l);
	if (l->debug.lock_thread != (void *) current_thread())
		panic("%s:  unlocking lock 0x%p, owned by thread %p",
		      caller, l, l->debug.lock_thread);
	if (l->debug.lock_cpu != mycpu) {
		printf("%s:  unlocking lock 0x%p on cpu 0x%x",
		       caller, l, mycpu);
		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
		panic("%s", caller);
	}
	usl_trace(l, mycpu, pc, caller);

	l->debug.unlock_thread = l->debug.lock_thread;
	l->debug.lock_thread = INVALID_THREAD;
	l->debug.state &= ~USLOCK_TAKEN;
	l->debug.unlock_pc = pc;
	l->debug.unlock_cpu = mycpu;
}


/*
 *	Debug checks on a usimple_lock just before
 *	attempting to acquire it.
 *
 *	Preemption isn't guaranteed to be disabled.
 */
void
usld_lock_try_pre(
	usimple_lock_t	l,
	pc_t		pc)
{
	char	caller[] = "usimple_lock_try";

	if (!usld_lock_common_checks(l, caller))
		return;
	mp_disable_preemption();
	usl_trace(l, cpu_number(), pc, caller);
	mp_enable_preemption();
}


/*
 *	Debug checks on a usimple_lock just after
 *	successfully attempting to acquire it.
 *
 *	Preemption has been disabled by the
 *	lock acquisition attempt, so it's safe
 *	to use cpu_number.
 */
void
usld_lock_try_post(
	usimple_lock_t	l,
	pc_t		pc)
{
	register int	mycpu;
	char	caller[] = "successful usimple_lock_try";

	if (!usld_lock_common_checks(l, caller))
		return;

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
		panic("%s:  lock 0x%p became uninitialized",
		      caller, l);
	if ((l->debug.state & USLOCK_TAKEN))
		panic("%s:  lock 0x%p became TAKEN by someone else",
		      caller, l);

	mycpu = cpu_number();
	l->debug.lock_thread = (void *) current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = mycpu;

	usl_trace(l, mycpu, pc, caller);
}


/*
 *	For very special cases, set traced_lock to point to a
 *	specific lock of interest.  The result is a series of
 *	XPRs showing lock operations on that lock.  The lock_seq
 *	value is used to show the order of those operations.
 */
usimple_lock_t	traced_lock;
unsigned int	lock_seq;

void
usl_trace(
	usimple_lock_t	l,
	int		mycpu,
	pc_t		pc,
	const char *	op_name)
{
	if (traced_lock == l) {
		XPR(XPR_SLOCK,
		    "seq %d, cpu %d, %s @ %x\n",
		    (uintptr_t) lock_seq, (uintptr_t) mycpu,
		    (uintptr_t) op_name, (uintptr_t) pc, 0);
		lock_seq++;
	}
}


#endif	/* USLOCK_DEBUG */

/*
 *	Routine:	lock_alloc
 *	Function:
 *		Allocate a lock for external users who cannot
 *		hard-code the structure definition into their
 *		objects.
 *		For now just use kalloc, but a zone is probably
 *		warranted.
 */
lock_t *
lock_alloc(
	boolean_t	can_sleep,
	unsigned short	tag,
	unsigned short	tag1)
{
	lock_t		*l;

	if ((l = (lock_t *)kalloc(sizeof(lock_t))) != 0)
		lock_init(l, can_sleep, tag, tag1);
	return(l);
}

/*
 *	Routine:	lock_free
 *	Function:
 *		Free a lock allocated for external users.
 *		For now just use kfree, but a zone is probably
 *		warranted.
 */
void
lock_free(
	lock_t		*l)
{
	kfree(l, sizeof(lock_t));
}


/*
 *	Routine:	lock_init
 *	Function:
 *		Initialize a lock; required before use.
 *		Note that clients declare the "struct lock"
 *		variables and then initialize them, rather
 *		than getting a new one from this module.
 */
void
lock_init(
	lock_t		*l,
	boolean_t	can_sleep,
	__unused unsigned short	tag,
	__unused unsigned short	tag1)
{
	hw_lock_byte_init(&l->lck_rw_interlock);
	l->lck_rw_want_write = FALSE;
	l->lck_rw_want_upgrade = FALSE;
	l->lck_rw_shared_count = 0;
	l->lck_rw_can_sleep = can_sleep;
	l->lck_rw_tag = tag;
	l->lck_rw_priv_excl = 1;
	l->lck_r_waiting = l->lck_w_waiting = 0;
}


/*
 *	Sleep locks.  These use the same data structure and algorithm
 *	as the spin locks, but the process sleeps while it is waiting
 *	for the lock.  These work on uniprocessor systems.
 */

#define DECREMENTER_TIMEOUT 1000000

void
lock_write(
	register lock_t	* l)
{
	lck_rw_lock_exclusive(l);
}

void
lock_done(
	register lock_t	* l)
{
	(void) lck_rw_done(l);
}

void
lock_read(
	register lock_t	* l)
{
	lck_rw_lock_shared(l);
}


/*
 *	Routine:	lock_read_to_write
 *	Function:
 *		Improves a read-only lock to one with
 *		write permission.  If another reader has
 *		already requested an upgrade to a write lock,
 *		no lock is held upon return.
 *
 *		Returns FALSE if the upgrade *failed*.
 */

boolean_t
lock_read_to_write(
	register lock_t	* l)
{
	return lck_rw_lock_shared_to_exclusive(l);
}

void
lock_write_to_read(
	register lock_t	* l)
{
	lck_rw_lock_exclusive_to_shared(l);
}



/*
 *	Routine:	lck_rw_alloc_init
 */
lck_rw_t *
lck_rw_alloc_init(
	lck_grp_t	*grp,
	lck_attr_t	*attr) {
	lck_rw_t	*lck;

	if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
		bzero(lck, sizeof(lck_rw_t));
		lck_rw_init(lck, grp, attr);
	}

	return(lck);
}

/*
 *	Routine:	lck_rw_free
 */
void
lck_rw_free(
	lck_rw_t	*lck,
	lck_grp_t	*grp) {
	lck_rw_destroy(lck, grp);
	kfree(lck, sizeof(lck_rw_t));
}

/*
 *	Routine:	lck_rw_init
 */
void
lck_rw_init(
	lck_rw_t	*lck,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_attr_t	*lck_attr = (attr != LCK_ATTR_NULL) ?
					attr : &LockDefaultLckAttr;

	hw_lock_byte_init(&lck->lck_rw_interlock);
	lck->lck_rw_want_write = FALSE;
	lck->lck_rw_want_upgrade = FALSE;
	lck->lck_rw_shared_count = 0;
	lck->lck_rw_can_sleep = TRUE;
	lck->lck_r_waiting = lck->lck_w_waiting = 0;
	lck->lck_rw_tag = 0;
	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
				LCK_ATTR_RW_SHARED_PRIORITY) == 0);

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
}

/*
 *	Routine:	lck_rw_destroy
 */
void
lck_rw_destroy(
	lck_rw_t	*lck,
	lck_grp_t	*grp)
{
	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
		return;
#if MACH_LDEBUG
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
#endif
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
	lck_grp_deallocate(grp);
	return;
}

/*
 *	Sleep locks.  These use the same data structure and algorithm
 *	as the spin locks, but the process sleeps while it is waiting
 *	for the lock.  These work on uniprocessor systems.
 */

#define DECREMENTER_TIMEOUT 1000000

#define RW_LOCK_READER_EVENT(x)		\
		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))

#define RW_LOCK_WRITER_EVENT(x)		\
		((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))

/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static boolean_t
lck_interlock_lock(lck_rw_t *lck)
{
	boolean_t	istate;

	istate = ml_set_interrupts_enabled(FALSE);
	hw_lock_byte_lock(&lck->lck_rw_interlock);

	return istate;
}

static void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	ml_set_interrupts_enabled(istate);
}

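/*
 * The *_gen routines below use these helpers in the following pattern
 * (shown here only as a sketch): the saved 'istate' must be handed back
 * to lck_interlock_unlock() so the caller's original interrupt enablement
 * is restored, since the caller may already run with interrupts disabled.
 *
 *	boolean_t istate = lck_interlock_lock(lck);	// interrupts off, byte lock held
 *	...						// inspect/update lck_rw_t state atomically
 *	lck_interlock_unlock(lck, istate);		// drop byte lock, restore interrupts
 */
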
/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes.
 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 */
static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)
{
	if (!interrupts_enabled)
		handle_pending_TLB_flushes();
	cpu_pause();
}


/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t *lck)
{
	if (lck->lck_rw_can_sleep) {
		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 *   or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return (mach_absolute_time());
		}
		return (mach_absolute_time() + MutexSpin);
	} else
		return (mach_absolute_time() + (100000LL * 1000000000LL));
}


/*
 *	Routine:	lck_rw_lock_exclusive
 */
void
lck_rw_lock_exclusive_gen(
	lck_rw_t	*lck)
{
	uint64_t	deadline = 0;
	int		slept = 0;
	int		gotlock = 0;
	int		lockheld = 0;
	wait_result_t	res = 0;
	boolean_t	istate = -1;

#if	CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	/*
	 *	Try to acquire the lck_rw_want_write bit.
	 */
	while ( !lck_rw_grab_want(lck)) {

#if	CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, gotlock, 0);

		if (gotlock)
			break;
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_want_write) {

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
			} else {
				lck->lck_rw_want_write = TRUE;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
	/*
	 * Wait for readers (and upgrades) to finish...
	 * the test for these conditions must be done simultaneously with
	 * a check of the interlock not being held since
	 * the rw_shared_count will drop to 0 first and then want_upgrade
	 * will be set to 1 in the shared_to_exclusive scenario... those
	 * adjustments are done behind the interlock and represent an
	 * atomic change in state and must be considered as such
	 * however, once we see the read count at 0, the want_upgrade not set
	 * and the interlock not held, we are safe to proceed
	 */
	while (lck_rw_held_read_or_upgrade(lck)) {

#if	CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, (int)lck, 0, 0, lockheld, 0);

		if ( !lockheld)
			break;
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, (int)lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lck, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_held_read_or_upgrade'
				 */
				break;
			}
		}
	}

#if	CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif
}


/*
 *	Routine:	lck_rw_done_gen
 *
 *	called from the assembly language wrapper...
 *	prior_lock_state is the value in the 1st
 *	word of the lock at the time of a successful
 *	atomic compare and exchange with the new value...
 *	it represents the state of the lock before we
 *	decremented the rw_shared_count or cleared either
 *	rw_want_upgrade or rw_want_write and
 *	the lck_x_waiting bits... since the wrapper
 *	routine has already changed the state atomically,
 *	we just need to decide if we should
 *	wake up anyone and what value to return... we do
 *	this by examining the state of the lock before
 *	we changed it
 */
lck_rw_type_t
lck_rw_done_gen(
	lck_rw_t	*lck,
	int		prior_lock_state)
{
	lck_rw_t	*fake_lck;
	lck_rw_type_t	lock_type;
	thread_t	thread = current_thread();
	uint32_t	rwlock_count;

	/* Check if dropping the lock means that we need to unpromote */
	rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}
#endif
	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread);
	}

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (fake_lck->lck_rw_shared_count <= 1) {
		if (fake_lck->lck_w_waiting)
			thread_wakeup(RW_LOCK_WRITER_EVENT(lck));

		if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
			thread_wakeup(RW_LOCK_READER_EVENT(lck));
	}
	if (fake_lck->lck_rw_shared_count)
		lock_type = LCK_RW_TYPE_SHARED;
	else
		lock_type = LCK_RW_TYPE_EXCLUSIVE;

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

	return(lock_type);
}


/*
 *	Routine:	lck_rw_unlock
 */
void
lck_rw_unlock(
	lck_rw_t	*lck,
	lck_rw_type_t	lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED)
		lck_rw_unlock_shared(lck);
	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
		lck_rw_unlock_exclusive(lck);
	else
		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
}


/*
 *	Routine:	lck_rw_unlock_shared
 */
void
lck_rw_unlock_shared(
	lck_rw_t	*lck)
{
	lck_rw_type_t	ret;

	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_SHARED)
		panic("lck_rw_unlock_shared(): lock held in mode: %d\n", ret);
}


/*
 *	Routine:	lck_rw_unlock_exclusive
 */
void
lck_rw_unlock_exclusive(
	lck_rw_t	*lck)
{
	lck_rw_type_t	ret;

	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_EXCLUSIVE)
		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
}


/*
 *	Routine:	lck_rw_lock
 */
void
lck_rw_lock(
	lck_rw_t	*lck,
	lck_rw_type_t	lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED)
		lck_rw_lock_shared(lck);
	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
		lck_rw_lock_exclusive(lck);
	else
		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
}

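/*
 * How the exported read-write entry points pair up, as an illustrative
 * sketch only; lck_rw_lock_shared(), lck_rw_lock_exclusive() and
 * lck_rw_done() are the assembly fast paths, not defined in this file,
 * and 'grp' is assumed to come from lck_grp_alloc_init():
 *
 *	lck_rw_t *rw = lck_rw_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock(rw, LCK_RW_TYPE_SHARED);	// readers may run concurrently
 *	...
 *	lck_rw_unlock(rw, LCK_RW_TYPE_SHARED);	// panics if not held shared
 *
 *	lck_rw_lock_exclusive(rw);		// single writer
 *	...
 *	(void) lck_rw_done(rw);			// drops whichever mode is held
 *
 *	lck_rw_free(rw, grp);
 */
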
/*
 *	Routine:	lck_rw_lock_shared_gen
 *	Function:
 *		assembly fast path code has determined that this lock
 *		is held exclusively... this is where we spin/block
 *		until we can acquire the lock in the shared mode
 */
void
lck_rw_lock_shared_gen(
	lck_rw_t	*lck)
{
	uint64_t	deadline = 0;
	int		gotlock = 0;
	int		slept = 0;
	wait_result_t	res = 0;
	boolean_t	istate = -1;

#if	CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif

	while ( !lck_rw_grab_shared(lck)) {

#if	CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
			     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

		if (gotlock)
			break;
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {

				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
					     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

				lck->lck_r_waiting = TRUE;

				res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
					     (int)lck, res, slept, 0, 0);
			} else {
				lck->lck_rw_shared_count++;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if	CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif
}


/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
 *	Function:
 *		assembly fast path code has already dropped our read
 *		count and determined that someone else owns 'lck_rw_want_upgrade'
 *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
 *		all we need to do here is determine if a wakeup is needed
 */
boolean_t
lck_rw_lock_shared_to_exclusive_failure(
	lck_rw_t	*lck,
	int		prior_lock_state)
{
	lck_rw_t	*fake_lck;
	thread_t	thread = current_thread();
	uint32_t	rwlock_count;

	/* Check if dropping the lock means that we need to unpromote */
	rwlock_count = thread->rwlock_count--;
#if MACH_LDEBUG
	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}
#endif
	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread);
	}

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
		/*
		 *	Someone else has requested upgrade.
		 *	Since we've released the read lock, wake
		 *	him up if he's blocked waiting
		 */
		thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
	}
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
		     (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

	return (FALSE);
}


/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_success
 *	Function:
 *		assembly fast path code has already dropped our read
 *		count and successfully acquired 'lck_rw_want_upgrade'
 *		we just need to wait for the rest of the readers to drain
 *		and then we can return as the exclusive holder of this lock
 */
boolean_t
lck_rw_lock_shared_to_exclusive_success(
	lck_rw_t	*lck)
{
	uint64_t	deadline = 0;
	int		slept = 0;
	int		still_shared = 0;
	wait_result_t	res;
	boolean_t	istate = -1;

#if	CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

	while (lck->lck_rw_shared_count != 0) {

#if	CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1)
			istate = ml_get_interrupts_enabled();

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);

		while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
			lck_rw_lock_pause(istate);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
			     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);

		if ( !still_shared)
			break;
		/*
		 * if we get here, the deadline has expired w/o
		 * the rw_shared_count having drained to 0
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {

			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
					     (int)lck, lck->lck_rw_shared_count, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
					     (int)lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
#if	CONFIG_DTRACE
	/*
	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
#endif
	return (TRUE);
}


/*
 *	Routine:	lck_rw_lock_exclusive_to_shared
 *	Function:
 *		assembly fast path has already dropped
 *		our exclusive state and bumped lck_rw_shared_count
 *		all we need to do here is determine if anyone
 *		needs to be awakened.
 */
void
lck_rw_lock_exclusive_to_shared_gen(
	lck_rw_t	*lck,
	int		prior_lock_state)
{
	lck_rw_t	*fake_lck;

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck = (lck_rw_t *)&prior_lock_state;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
		     (int)lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);

	/*
	 * don't wake up anyone waiting to take the lock exclusively
	 * since we hold a read count... when the read count drops to 0,
	 * the writers will be woken.
	 *
	 * wake up any waiting readers if we don't have any writers waiting,
	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
	 */
	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
		thread_wakeup(RW_LOCK_READER_EVENT(lck));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
		     (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
#endif
}


/*
 *	Routine:	lck_rw_try_lock
 */
boolean_t
lck_rw_try_lock(
	lck_rw_t	*lck,
	lck_rw_type_t	lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED)
		return(lck_rw_try_lock_shared(lck));
	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
		return(lck_rw_try_lock_exclusive(lck));
	else
		panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
	return(FALSE);
}


void
lck_rw_assert(
	lck_rw_t	*lck,
	unsigned int	type)
{
	switch (type) {
	case LCK_RW_ASSERT_SHARED:
		if (lck->lck_rw_shared_count != 0) {
			return;
		}
		break;
	case LCK_RW_ASSERT_EXCLUSIVE:
		if ((lck->lck_rw_want_write ||
		     lck->lck_rw_want_upgrade) &&
		    lck->lck_rw_shared_count == 0) {
			return;
		}
		break;
	case LCK_RW_ASSERT_HELD:
		if (lck->lck_rw_want_write ||
		    lck->lck_rw_want_upgrade ||
		    lck->lck_rw_shared_count != 0) {
			return;
		}
		break;
	case LCK_RW_ASSERT_NOTHELD:
		if (!(lck->lck_rw_want_write ||
		      lck->lck_rw_want_upgrade ||
		      lck->lck_rw_shared_count != 0)) {
			return;
		}
		break;
	default:
		break;
	}

	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
}

/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
void
lck_rw_clear_promotions_x86(thread_t thread)
{
#if MACH_LDEBUG
	/* It's fatal to leave a RW lock locked and return to userspace */
	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
#else
	/* Paper over the issue */
	thread->rwlock_count = 0;
	lck_rw_clear_promotion(thread);
#endif
}


#ifdef	MUTEX_ZONE
extern zone_t lck_mtx_zone;
#endif
/*
 *	Routine:	lck_mtx_alloc_init
 */
lck_mtx_t *
lck_mtx_alloc_init(
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_mtx_t	*lck;
#ifdef	MUTEX_ZONE
	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
		lck_mtx_init(lck, grp, attr);
#else
	if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
		lck_mtx_init(lck, grp, attr);
#endif
	return(lck);
}

/*
 *	Routine:	lck_mtx_free
 */
void
lck_mtx_free(
	lck_mtx_t	*lck,
	lck_grp_t	*grp)
{
	lck_mtx_destroy(lck, grp);
#ifdef	MUTEX_ZONE
	zfree(lck_mtx_zone, lck);
#else
	kfree(lck, sizeof(lck_mtx_t));
#endif
}

/*
 *	Routine:	lck_mtx_ext_init
 */
static void
lck_mtx_ext_init(
	lck_mtx_ext_t	*lck,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	bzero((void *)lck, sizeof(lck_mtx_ext_t));

	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck->lck_mtx_deb.type = MUTEX_TAG;
		lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
	}

	lck->lck_mtx_grp = grp;

	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
		lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;

	lck->lck_mtx.lck_mtx_is_ext = 1;
	lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
}

/*
 *	Routine:	lck_mtx_init
 */
void
lck_mtx_init(
	lck_mtx_t	*lck,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_mtx_ext_t	*lck_ext;
	lck_attr_t	*lck_attr;

	if (attr != LCK_ATTR_NULL)
		lck_attr = attr;
	else
		lck_attr = &LockDefaultLckAttr;

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
			lck_mtx_ext_init(lck_ext, grp, lck_attr);
			lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
			lck->lck_mtx_ptr = lck_ext;
		}
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}

/*
 *	Routine:	lck_mtx_init_ext
 */
void
lck_mtx_init_ext(
	lck_mtx_t	*lck,
	lck_mtx_ext_t	*lck_ext,
	lck_grp_t	*grp,
	lck_attr_t	*attr)
{
	lck_attr_t	*lck_attr;

	if (attr != LCK_ATTR_NULL)
		lck_attr = attr;
	else
		lck_attr = &LockDefaultLckAttr;

	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
		lck_mtx_ext_init(lck_ext, grp, lck_attr);
		lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
		lck->lck_mtx_ptr = lck_ext;
	} else {
		lck->lck_mtx_owner = 0;
		lck->lck_mtx_state = 0;
	}
	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
}

/*
 *	Routine:	lck_mtx_destroy
 */
void
lck_mtx_destroy(
	lck_mtx_t	*lck,
	lck_grp_t	*grp)
{
	boolean_t lck_is_indirect;

	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
		return;
#if MACH_LDEBUG
	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
	lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

	lck_mtx_lock_mark_destroyed(lck);

	if (lck_is_indirect)
		kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
	lck_grp_deallocate(grp);
	return;
}


#define	LCK_MTX_LCK_WAIT_CODE		0x20
#define	LCK_MTX_LCK_WAKEUP_CODE		0x21
#define	LCK_MTX_LCK_SPIN_CODE		0x22
#define	LCK_MTX_LCK_ACQUIRE_CODE	0x23
#define LCK_MTX_LCK_DEMOTE_CODE		0x24


/*
 *	Routine:	lck_mtx_unlock_wakeup_x86
 *
 *	Invoked on unlock when there is
 *	contention (i.e. the assembly routine sees that
 *	mutex->lck_mtx_waiters != 0 or
 *	mutex->lck_mtx_promoted != 0)...
 *
 *	neither the mutex nor the interlock is held
 */
void
lck_mtx_unlock_wakeup_x86 (
	lck_mtx_t	*mutex,
	int		prior_lock_state)
{
	lck_mtx_t	fake_lck;

	/*
	 * prior_lock state is a snapshot of the 2nd word of the
	 * lock in question... we'll fake up a lock with the bits
	 * copied into place and carefully not access anything
	 * beyond whats defined in the second word of a lck_mtx_t
	 */
	fake_lck.lck_mtx_state = prior_lock_state;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
		     mutex, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);

	if (__probable(fake_lck.lck_mtx_waiters)) {
		if (fake_lck.lck_mtx_waiters > 1)
			thread_wakeup_one_with_pri((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)), fake_lck.lck_mtx_pri);
		else
			thread_wakeup_one((event_t)(((unsigned int*)mutex)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));
	}

	if (__improbable(fake_lck.lck_mtx_promoted)) {
		thread_t	thread = current_thread();


		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
			     thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);

		if (thread->promotions > 0) {
			spl_t	s = splsched();

			thread_lock(thread);

			if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {

				thread->sched_flags &= ~TH_SFLAG_PROMOTED;

				if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
					KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
							      thread->sched_pri, DEPRESSPRI, 0, mutex, 0);

					set_sched_pri(thread, DEPRESSPRI);
				}
				else {
					if (thread->priority < thread->sched_pri) {
						KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
								      thread->sched_pri, thread->priority, 0, mutex, 0);

						SCHED(compute_priority)(thread, FALSE);
					}
				}
			}
			thread_unlock(thread);
			splx(s);
		}
	}
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
}


/*
 *	Routine:	lck_mtx_lock_acquire_x86
 *
 *	Invoked on acquiring the mutex when there is
 *	contention (i.e. the assembly routine sees that
 *	mutex->lck_mtx_waiters != 0 or
 *	thread->was_promoted_on_wakeup != 0)...
 *
 *	mutex is owned...  interlock is held... preemption is disabled
 */
void
lck_mtx_lock_acquire_x86(
	lck_mtx_t	*mutex)
{
	thread_t	thread;
	integer_t	priority;
	spl_t		s;

	thread = (thread_t)mutex->lck_mtx_owner;	/* faster than current_thread() */

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
		     mutex, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

	if (mutex->lck_mtx_waiters)
		priority = mutex->lck_mtx_pri;
	else
		priority = 0;

	if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {

		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
				      thread->sched_pri, priority, thread->was_promoted_on_wakeup, mutex, 0);

		s = splsched();
		thread_lock(thread);

		if (thread->sched_pri < priority) {
			/* Do not promote into the realtime priority band */
			assert(priority <= MAXPRI_KERNEL);
			set_sched_pri(thread, priority);
		}
		if (mutex->lck_mtx_promoted == 0) {
			mutex->lck_mtx_promoted = 1;

			thread->promotions++;
			thread->sched_flags |= TH_SFLAG_PROMOTED;
		}
		thread->was_promoted_on_wakeup = 0;

		thread_unlock(thread);
		splx(s);
	}
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
		     mutex, 0, mutex->lck_mtx_waiters, 0, 0);
}



/*
 *	Routine:	lck_mtx_lock_spinwait_x86
 *
 *	Invoked trying to acquire a mutex when there is contention but
 *	the holder is running on another processor.  We spin for up to a maximum
 *	time waiting for the lock to be released.
 *
 *	Called with the interlock unlocked.
 *	returns 0 if mutex acquired
 *	returns 1 if we spun
 *	returns 2 if we didn't spin due to the holder not running
 */
int
lck_mtx_lock_spinwait_x86(
	lck_mtx_t	*mutex)
{
	thread_t	holder;
	uint64_t	deadline;
	int		retval = 1;
	int		loopcount = 0;


	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, 0, 0);

	deadline = mach_absolute_time() + MutexSpin;

	/*
	 * Spin while:
	 *   - mutex is locked, and
	 *   - its locked as a spin lock, and
	 *   - owner is running on another processor, and
	 *   - owner (processor) is not idling, and
	 *   - we haven't spun for long enough.
	 */
	do {
		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
			retval = 0;
			break;
		}
		if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {

			if ( !(holder->machine.specFlags & OnProc) ||
			     (holder->state & TH_IDLE)) {
				if (loopcount == 0)
					retval = 2;
				break;
			}
		}
		cpu_pause();

		loopcount++;

	} while (mach_absolute_time() < deadline);


#if	CONFIG_DTRACE
	/*
	 * We've already kept a count via deadline of how long we spun.
	 * If dtrace is active, then we compute backwards to decide how
	 * long we spun.
	 *
	 * Note that we record a different probe id depending on whether
	 * this is a direct or indirect mutex.  This allows us to
	 * penalize only lock groups that have debug/stats enabled
	 * with dtrace processing if desired.
	 */
	if (__probable(mutex->lck_mtx_is_ext == 0)) {
		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
			mach_absolute_time() - (deadline - MutexSpin));
	} else {
		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
			mach_absolute_time() - (deadline - MutexSpin));
	}
	/* The lockstat acquire event is recorded by the assembly code beneath us. */
#endif

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, retval, 0);

	return retval;
}



/*
 *	Routine:	lck_mtx_lock_wait_x86
 *
 *	Invoked in order to wait on contention.
 *
 *	Called with the interlock locked and
 *	preemption disabled...
 *	returns it unlocked and with preemption enabled
 */
void
lck_mtx_lock_wait_x86 (
	lck_mtx_t	*mutex)
{
	thread_t	self = current_thread();
	thread_t	holder;
	integer_t	priority;
	spl_t		s;
#if	CONFIG_DTRACE
	uint64_t	sleep_start = 0;

	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
		sleep_start = mach_absolute_time();
	}
#endif
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

	priority = self->sched_pri;

	if (priority < self->priority)
		priority = self->priority;
	if (priority < BASEPRI_DEFAULT)
		priority = BASEPRI_DEFAULT;

	/* Do not promote into the realtime priority band */
	priority = MIN(priority, MAXPRI_KERNEL);

	if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
		mutex->lck_mtx_pri = priority;
	mutex->lck_mtx_waiters++;

	if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
	     holder->sched_pri < mutex->lck_mtx_pri ) {
		/* Assert that we're not altering the priority of a
		 * MAXPRI_KERNEL or RT prio band thread
		 */
		assert(holder->sched_pri < MAXPRI_KERNEL);
		s = splsched();
		thread_lock(holder);

		if (holder->sched_pri < mutex->lck_mtx_pri) {
			KERNEL_DEBUG_CONSTANT(
				MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
				holder->sched_pri, priority, thread_tid(holder), mutex, 0);

			set_sched_pri(holder, priority);

			if (mutex->lck_mtx_promoted == 0) {
				holder->promotions++;
				holder->sched_flags |= TH_SFLAG_PROMOTED;

				mutex->lck_mtx_promoted = 1;
			}
		}
		thread_unlock(holder);
		splx(s);
	}
	assert_wait((event_t)(((unsigned int*)mutex)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT);

	lck_mtx_ilk_unlock(mutex);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
		     mutex, mutex->lck_mtx_owner, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);

#if	CONFIG_DTRACE
	/*
	 * Record the Dtrace lockstat probe for blocking, block time
	 * measured from when we were entered.
	 */
	if (sleep_start) {
		if (mutex->lck_mtx_is_ext == 0) {
			LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
			    mach_absolute_time() - sleep_start);
		} else {
			LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
			    mach_absolute_time() - sleep_start);
		}
	}
#endif
}