kern_synch.c revision 248186
/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_synch.c 248186 2013-03-12 06:58:49Z mav $");

#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#ifdef XEN
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#endif

#define	KTDSTATE(td) \
        (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \
        ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
        ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \
        ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \
        ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
"iwait" : "yielding") 82 83static void synch_setup(void *dummy); 84SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, 85 NULL); 86 87int hogticks; 88static uint8_t pause_wchan[MAXCPU]; 89 90static struct callout loadav_callout; 91 92struct loadavg averunnable = 93 { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ 94/* 95 * Constants for averages over 1, 5, and 15 minutes 96 * when sampling at 5 second intervals. 97 */ 98static fixpt_t cexp[3] = { 99 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 100 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 101 0.9944598480048967 * FSCALE, /* exp(-1/180) */ 102}; 103 104/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ 105static int fscale __unused = FSCALE; 106SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); 107 108static void loadav(void *arg); 109 110SDT_PROVIDER_DECLARE(sched); 111SDT_PROBE_DEFINE(sched, , , preempt, preempt); 112 113/* 114 * These probes reference Solaris features that are not implemented in FreeBSD. 115 * Create the probes anyway for compatibility with existing D scripts; they'll 116 * just never fire. 117 */ 118SDT_PROBE_DEFINE(sched, , , cpucaps_sleep, cpucaps-sleep); 119SDT_PROBE_DEFINE(sched, , , cpucaps_wakeup, cpucaps-wakeup); 120SDT_PROBE_DEFINE(sched, , , schedctl_nopreempt, schedctl-nopreempt); 121SDT_PROBE_DEFINE(sched, , , schedctl_preempt, schedctl-preempt); 122SDT_PROBE_DEFINE(sched, , , schedctl_yield, schedctl-yield); 123 124void 125sleepinit(void) 126{ 127 128 hogticks = (hz / 10) * 2; /* Default only. */ 129 init_sleepqueues(); 130} 131 132/* 133 * General sleep call. Suspends the current thread until a wakeup is 134 * performed on the specified identifier. The thread will then be made 135 * runnable with the specified priority. Sleeps at most timo/hz seconds 136 * (0 means no timeout). If pri includes the PCATCH flag, let signals 137 * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if 138 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a 139 * signal becomes pending, ERESTART is returned if the current system 140 * call should be restarted if possible, and EINTR is returned if the system 141 * call should be interrupted by the signal (return EINTR). 142 * 143 * The lock argument is unlocked before the caller is suspended, and 144 * re-locked before _sleep() returns. If priority includes the PDROP 145 * flag the lock is not re-locked before returning. 
int
_sleep(void *ident, struct lock_object *lock, int priority,
    const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
        struct thread *td;
        struct proc *p;
        struct lock_class *class;
        int catch, lock_state, pri, rval, sleepq_flags;
        WITNESS_SAVE_DECL(lock_witness);

        td = curthread;
        p = td->td_proc;
#ifdef KTRACE
        if (KTRPOINT(td, KTR_CSW))
                ktrcsw(1, 0, wmesg);
#endif
        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
            "Sleeping on \"%s\"", wmesg);
        KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL,
            ("sleeping without a lock"));
        KASSERT(p != NULL, ("msleep1"));
        KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
        if (priority & PDROP)
                KASSERT(lock != NULL && lock != &Giant.lock_object,
                    ("PDROP requires a non-Giant lock"));
        if (lock != NULL)
                class = LOCK_CLASS(lock);
        else
                class = NULL;

        if (cold || SCHEDULER_STOPPED()) {
                /*
                 * During autoconfiguration, just return;
                 * don't run any other threads or panic below,
                 * in case this is the idle thread and already asleep.
                 * XXX: this used to do "s = splhigh(); splx(safepri);
                 * splx(s);" to give interrupts a chance, but there is
                 * no way to give interrupts a chance now.
                 */
                if (lock != NULL && priority & PDROP)
                        class->lc_unlock(lock);
                return (0);
        }
        catch = priority & PCATCH;
        pri = priority & PRIMASK;

        /*
         * If we are already on a sleep queue, then remove us from that
         * sleep queue first.  We have to do this to handle recursive
         * sleeps.
         */
        if (TD_ON_SLEEPQ(td))
                sleepq_remove(td, td->td_wchan);

        if ((uint8_t *)ident >= &pause_wchan[0] &&
            (uint8_t *)ident <= &pause_wchan[MAXCPU - 1])
                sleepq_flags = SLEEPQ_PAUSE;
        else
                sleepq_flags = SLEEPQ_SLEEP;
        if (catch)
                sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
        if (priority & PBDRY)
                sleepq_flags |= SLEEPQ_STOP_ON_BDRY;

        sleepq_lock(ident);
        CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
            td->td_tid, p->p_pid, td->td_name, wmesg, ident);

        if (lock == &Giant.lock_object)
                mtx_assert(&Giant, MA_OWNED);
        DROP_GIANT();
        if (lock != NULL && lock != &Giant.lock_object &&
            !(class->lc_flags & LC_SLEEPABLE)) {
                WITNESS_SAVE(lock, lock_witness);
                lock_state = class->lc_unlock(lock);
        } else
                /* GCC needs to follow the Yellow Brick Road */
                lock_state = -1;

        /*
         * We put ourselves on the sleep queue and start our timeout
         * before calling thread_suspend_check, as we could stop there,
         * and a wakeup or a SIGCONT (or both) could occur while we were
         * stopped without resuming us.  Thus, we must be ready for sleep
         * when cursig() is called.  If the wakeup happens while we're
         * stopped, then td will no longer be on a sleep queue upon
         * return from cursig().
         */
        sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
        if (sbt != 0)
                sleepq_set_timeout_sbt(ident, sbt, pr, flags);
        if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
                sleepq_release(ident);
                WITNESS_SAVE(lock, lock_witness);
                lock_state = class->lc_unlock(lock);
                sleepq_lock(ident);
        }
        if (sbt != 0 && catch)
                rval = sleepq_timedwait_sig(ident, pri);
        else if (sbt != 0)
                rval = sleepq_timedwait(ident, pri);
        else if (catch)
                rval = sleepq_wait_sig(ident, pri);
        else {
                sleepq_wait(ident, pri);
                rval = 0;
        }
#ifdef KTRACE
        if (KTRPOINT(td, KTR_CSW))
                ktrcsw(0, 0, wmesg);
#endif
        PICKUP_GIANT();
        if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
                class->lc_lock(lock, lock_state);
                WITNESS_RESTORE(lock, lock_witness);
        }
        return (rval);
}

int
msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
    sbintime_t sbt, sbintime_t pr, int flags)
{
        struct thread *td;
        struct proc *p;
        int rval;
        WITNESS_SAVE_DECL(mtx);

        td = curthread;
        p = td->td_proc;
        KASSERT(mtx != NULL, ("sleeping without a mutex"));
        KASSERT(p != NULL, ("msleep1"));
        KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));

        if (cold || SCHEDULER_STOPPED()) {
                /*
                 * During autoconfiguration, just return;
                 * don't run any other threads or panic below,
                 * in case this is the idle thread and already asleep.
                 * XXX: this used to do "s = splhigh(); splx(safepri);
                 * splx(s);" to give interrupts a chance, but there is
                 * no way to give interrupts a chance now.
                 */
                return (0);
        }

        sleepq_lock(ident);
        CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
            td->td_tid, p->p_pid, td->td_name, wmesg, ident);

        DROP_GIANT();
        mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
        WITNESS_SAVE(&mtx->lock_object, mtx);
        mtx_unlock_spin(mtx);

        /*
         * We put ourselves on the sleep queue and start our timeout.
         */
        sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
        if (sbt != 0)
                sleepq_set_timeout_sbt(ident, sbt, pr, flags);

        /*
         * Can't call ktrace with any spin locks held so it can lock the
         * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
         * any spin lock.  Thus, we have to drop the sleepq spin lock while
         * we handle those requests.  This is safe since we have placed our
         * thread on the sleep queue already.
         */
#ifdef KTRACE
        if (KTRPOINT(td, KTR_CSW)) {
                sleepq_release(ident);
                ktrcsw(1, 0, wmesg);
                sleepq_lock(ident);
        }
#endif
#ifdef WITNESS
        sleepq_release(ident);
        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
            wmesg);
        sleepq_lock(ident);
#endif
        if (sbt != 0)
                rval = sleepq_timedwait(ident, 0);
        else {
                sleepq_wait(ident, 0);
                rval = 0;
        }
#ifdef KTRACE
        if (KTRPOINT(td, KTR_CSW))
                ktrcsw(0, 0, wmesg);
#endif
        PICKUP_GIANT();
        mtx_lock_spin(mtx);
        WITNESS_RESTORE(&mtx->lock_object, mtx);
        return (rval);
}

/*
 * pause() delays the calling thread by the given number of system ticks.
 * During cold bootup, pause() uses the DELAY() function instead of
 * the tsleep() function to do the waiting.  The "timo" argument must be
 * greater than or equal to zero.  A "timo" value of zero is equivalent
 * to a "timo" value of one.
 */
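/*
 * Illustrative sketch, not part of the original file: a caller that just
 * needs to sleep for roughly 100 ms and has no wakeup channel can use the
 * tick-based pause() interface from <sys/systm.h>, e.g.
 *
 *	pause("dlywt", hz / 10);
 *
 * which ends up in pause_sbt() below; the wait is not interruptible by
 * signals.  The wait-channel name "dlywt" here is just an example.
 */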
int
pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
        int sbt_sec;

        sbt_sec = sbintime_getsec(sbt);
        KASSERT(sbt_sec >= 0, ("pause: timo must be >= 0"));

        /* silently convert invalid timeouts */
        if (sbt == 0)
                sbt = tick_sbt;

        if (cold) {
                /*
                 * We delay one second at a time to avoid overflowing the
                 * system specific DELAY() function(s):
                 */
                while (sbt_sec > 0) {
                        DELAY(1000000);
                        sbt_sec--;
                }
                DELAY((sbt & 0xffffffff) / SBT_1US);
                return (0);
        }
        return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags));
}

/*
 * Make all threads sleeping on the specified identifier runnable.
 */
void
wakeup(void *ident)
{
        int wakeup_swapper;

        sleepq_lock(ident);
        wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
        sleepq_release(ident);
        if (wakeup_swapper) {
                KASSERT(ident != &proc0,
                    ("wakeup and wakeup_swapper and proc0"));
                kick_proc0();
        }
}

/*
 * Make a thread sleeping on the specified identifier runnable.
 * May wake more than one thread if a target thread is currently
 * swapped out.
 */
void
wakeup_one(void *ident)
{
        int wakeup_swapper;

        sleepq_lock(ident);
        wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
        sleepq_release(ident);
        if (wakeup_swapper)
                kick_proc0();
}

static void
kdb_switch(void)
{
        thread_unlock(curthread);
        kdb_backtrace();
        kdb_reenter();
        panic("%s: did not reenter debugger", __func__);
}

/*
 * The machine independent parts of context switching.
 */
void
mi_switch(int flags, struct thread *newtd)
{
        uint64_t runtime, new_switchtime;
        struct thread *td;
        struct proc *p;

        td = curthread;			/* XXX */
        THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
        p = td->td_proc;		/* XXX */
        KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
        if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
                mtx_assert(&Giant, MA_NOTOWNED);
#endif
        KASSERT(td->td_critnest == 1 || panicstr,
            ("mi_switch: switch in a critical section"));
        KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
            ("mi_switch: switch must be voluntary or involuntary"));
        KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));

        /*
         * Don't perform context switches from the debugger.
         */
        if (kdb_active)
                kdb_switch();
        if (SCHEDULER_STOPPED())
                return;
        if (flags & SW_VOL) {
                td->td_ru.ru_nvcsw++;
                td->td_swvoltick = ticks;
        } else
                td->td_ru.ru_nivcsw++;
#ifdef SCHED_STATS
        SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
#endif
        /*
         * Compute the amount of time during which the current
         * thread was running, and add that to its total so far.
         */
        new_switchtime = cpu_ticks();
        runtime = new_switchtime - PCPU_GET(switchtime);
        td->td_runtime += runtime;
        td->td_incruntime += runtime;
        PCPU_SET(switchtime, new_switchtime);
        td->td_generation++;	/* bump preempt-detect counter */
        PCPU_INC(cnt.v_swtch);
        PCPU_SET(switchticks, ticks);
        CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
            td->td_tid, td->td_sched, p->p_pid, td->td_name);
#if (KTR_COMPILE & KTR_SCHED) != 0
        if (TD_IS_IDLETHREAD(td))
                KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
                    "prio:%d", td->td_priority);
        else
                KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
                    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
                    "lockname:\"%s\"", td->td_lockname);
#endif
        SDT_PROBE0(sched, , , preempt);
#ifdef XEN
        PT_UPDATES_FLUSH();
#endif
        sched_switch(td, newtd, flags);
        KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
            "prio:%d", td->td_priority);

        CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
            td->td_tid, td->td_sched, p->p_pid, td->td_name);

        /*
         * If the last thread was exiting, finish cleaning it up.
         */
        if ((td = PCPU_GET(deadthread))) {
                PCPU_SET(deadthread, NULL);
                thread_stash(td);
        }
}

/*
 * Change thread state to be runnable, placing it on the run queue if
 * it is in memory.  If it is swapped out, return true so our caller
 * will know to awaken the swapper.
 */
int
setrunnable(struct thread *td)
{

        THREAD_LOCK_ASSERT(td, MA_OWNED);
        KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
            ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
        switch (td->td_state) {
        case TDS_RUNNING:
        case TDS_RUNQ:
                return (0);
        case TDS_INHIBITED:
                /*
                 * If we are only inhibited because we are swapped out
                 * then arrange to swap in this process.  Otherwise just return.
                 */
                if (td->td_inhibitors != TDI_SWAPPED)
                        return (0);
                /* FALLTHROUGH */
        case TDS_CAN_RUN:
                break;
        default:
                printf("state is 0x%x", td->td_state);
                panic("setrunnable(2)");
        }
        if ((td->td_flags & TDF_INMEM) == 0) {
                if ((td->td_flags & TDF_SWAPINREQ) == 0) {
                        td->td_flags |= TDF_SWAPINREQ;
                        return (1);
                }
        } else
                sched_wakeup(td);
        return (0);
}

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.
 */
static void
loadav(void *arg)
{
        int i, nrun;
        struct loadavg *avg;

        nrun = sched_load();
        avg = &averunnable;

        for (i = 0; i < 3; i++)
                avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
                    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

        /*
         * Schedule the next update to occur after 5 seconds, but add a
         * random variation to avoid synchronisation with processes that
         * run at regular intervals.
         */
        callout_reset_sbt(&loadav_callout,
            tick_sbt * (hz * 4 + (int)(random() % (hz * 2 + 1))), 0,
            loadav, NULL, C_DIRECT_EXEC | C_HARDCLOCK);
}
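
/*
 * Illustrative note, not part of the original file: with the 5 second
 * sampling interval used above, the fixed-point update in loadav() is the
 * usual exponentially decaying average
 *
 *	load[i] = load[i] * exp(-5 / T[i]) + nrun * (1 - exp(-5 / T[i]))
 *
 * for T = 60, 300 and 900 seconds, which is where the exp(-1/12),
 * exp(-1/60) and exp(-1/180) entries of cexp[] come from.
 */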

/* ARGSUSED */
static void
synch_setup(void *dummy)
{
        callout_init(&loadav_callout, CALLOUT_MPSAFE);

        /* Kick off timeout driven events by calling first time. */
        loadav(NULL);
}

int
should_yield(void)
{

        return (ticks - curthread->td_swvoltick >= hogticks);
}

void
maybe_yield(void)
{

        if (should_yield())
                kern_yield(PRI_USER);
}

void
kern_yield(int prio)
{
        struct thread *td;

        td = curthread;
        DROP_GIANT();
        thread_lock(td);
        if (prio == PRI_USER)
                prio = td->td_user_pri;
        if (prio >= 0)
                sched_prio(td, prio);
        mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
        thread_unlock(td);
        PICKUP_GIANT();
}

/*
 * General purpose yield system call.
 */
int
sys_yield(struct thread *td, struct yield_args *uap)
{

        thread_lock(td);
        if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
                sched_prio(td, PRI_MAX_TIMESHARE);
        mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
        thread_unlock(td);
        td->td_retval[0] = 0;
        return (0);
}
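
/*
 * Illustrative sketch, not part of the original file: long-running loops
 * in kernel code typically call maybe_yield() on each iteration, so that
 * a thread which has run for about hogticks (2/10 of a second with the
 * default set in sleepinit()) without a voluntary context switch gives
 * up the CPU.  With hypothetical helpers more_work() and do_one_item():
 *
 *	while (more_work(arg)) {
 *		do_one_item(arg);
 *		maybe_yield();
 *	}
 */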