/*	$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020, 2023
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $");

#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

int	dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	.sobj_name	= "sleep",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t sched_syncobj = {
	.sobj_name	= "sched",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_USER,
	.sobj_unsleep	= sched_unsleep,
	.sobj_changepri	= sched_changepri,
	.sobj_lendpri	= sched_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t kpause_syncobj = {
	.sobj_name	= "kpause",
	.sobj_flag	= SOBJ_SLEEPQ_NULL,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t lbolt __cacheline_aligned;

u_int sched_pstats_ticks __cacheline_aligned;

/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit __cacheline_aligned;
static struct evcnt kpreempt_ev_klock __cacheline_aligned;
static struct evcnt kpreempt_ev_immed __cacheline_aligned;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 */
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;
	int nlocks;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);
	//KASSERT(KERNEL_LOCKED_P());

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		return 0;
	}

	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	return sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;
	int error, nlocks;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	mutex_exit(mtx);
	error = sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
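
/*
 * Illustrative sketch only (the "sc_done" flag, "xydev" wait message and
 * the surrounding driver are hypothetical, not part of this file): code
 * written against this legacy interface pairs tsleep() with wakeup() on
 * the same identifier, e.g.
 *
 *	while (!sc->sc_done) {
 *		error = tsleep(&sc->sc_done, PWAIT | PCATCH, "xydev", hz);
 *		if (error != 0 && error != EWOULDBLOCK)
 *			return error;	// EINTR or ERESTART from a signal
 *	}
 *
 * with the interrupt handler ending in wakeup(&sc->sc_done).  New code
 * should use condition variables (cv_wait(9) and friends) instead.
 */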

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	int error, nlocks;

	KASSERTMSG(timo != 0 || intr, "wmesg=%s intr=%s timo=%d mtx=%p",
	    wmesg, intr ? "true" : "false", timo, mtx);

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	nlocks = sleepq_enter(NULL, l, NULL);
	sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
	error = sleepq_block(timo, intr, &kpause_syncobj, nlocks);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
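
/*
 * Illustrative sketch only (the device_busy() condition and "xypoll"
 * message are hypothetical): a polling loop that must not spin can give
 * up the CPU for roughly 10ms per iteration, using mstohz() to convert
 * milliseconds to clock ticks:
 *
 *	while (device_busy(sc))
 *		kpause("xypoll", false, mstohz(10), NULL);
 *
 * Since no wakeup() is expected, the timeout must be non-zero unless the
 * sleep is interruptible, as the KASSERTMSG above enforces.
 */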

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a context switch.
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	spc_lock(l->l_cpu);
	mi_switch(l);
	KERNEL_LOCK(nlocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.  Different from yield()
 * in that:
 *
 * - It's counted differently (involuntary vs. voluntary).
 * - Realtime threads go to the head of their runqueue vs. tail for yield().
 */
void
preempt(void)
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	spc_lock(l->l_cpu);
	l->l_pflag |= LP_PREEMPTING;
	mi_switch(l);
	KERNEL_LOCK(nlocks, l);
}

/*
 * Return true if the current LWP should yield the processor.  Intended
 * to be used by long-running code in the kernel.
 */
inline bool
preempt_needed(void)
{
	lwp_t *l = curlwp;
	int needed;

	KPREEMPT_DISABLE(l);
	needed = l->l_cpu->ci_want_resched;
	KPREEMPT_ENABLE(l);

	return (needed != 0);
}

/*
 * A breathing point for long-running code in the kernel.
 */
void
preempt_point(void)
{

	if (__predict_false(preempt_needed())) {
		preempt();
	}
}
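
/*
 * Illustrative sketch only (the bulk operation is hypothetical): a
 * long-running kernel loop can stay responsive by offering to
 * reschedule once per iteration:
 *
 *	for (i = 0; i < npages; i++) {
 *		process_page(i);	// hypothetical per-item work
 *		preempt_point();
 *	}
 *
 * preempt_point() is cheap when no reschedule is pending, so calling it
 * frequently is fine.
 */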

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	kpreempt_is_disabled;
static char	kernel_lock_held;
static char	is_softint_lwp;
static char	spl_is_raised;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			atomic_swap_uint(&l->l_dopreempt, 0);
			return true;
		}
		KASSERT((l->l_flag & LW_IDLE) == 0);
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&kpreempt_is_disabled;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			atomic_swap_uint(&l->l_dopreempt, 0);
			failed = (uintptr_t)&is_softint_lwp;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&spl_is_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		l->l_pflag |= LP_PREEMPTING;
		spc_lock(l->l_cpu);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
	    cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
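
/*
 * Illustrative sketch only: code that takes a reference to per-CPU state
 * must hold off kernel preemption so the LWP is not migrated to another
 * CPU mid-sequence:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	...			// ci remains this LWP's CPU throughout
 *	kpreempt_enable();
 *
 * Holding a spin mutex has the same effect, since it keeps the IPL
 * raised.
 */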

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */
void
updatertime(lwp_t *l, const struct bintime *now)
{
	static bool backwards = false;

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
		char caller[128];

#ifdef DDB
		db_symstr(caller, sizeof(caller),
		    (db_expr_t)(intptr_t)__builtin_return_address(0),
		    DB_STGY_PROC);
#else
		snprintf(caller, sizeof(caller), "%p",
		    __builtin_return_address(0));
#endif
		backwards = true;
		printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
		    " timecounter went backwards"
		    " from (%jd + 0x%016"PRIx64"/2^64) sec"
		    " to (%jd + 0x%016"PRIx64"/2^64) sec"
		    " in %s\n",
		    (long)l->l_lid,
		    l->l_proc->p_comm,
		    l->l_name ? " " : "",
		    l->l_name ? l->l_name : "",
		    l->l_pflag,
		    (intmax_t)l->l_stime.sec, l->l_stime.frac,
		    (intmax_t)now->sec, now->frac,
		    caller);
	}

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}
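
/*
 * A worked example of the bintime arithmetic above, with hypothetical
 * values: if the LWP went on CPU at l_stime = 5.25s and the switch
 * happens at now = 5.75s, then l_rtime gains now - stime = 0.5s:
 *
 *	struct bintime delta = *now;		// 5.75s
 *	bintime_sub(&delta, &l->l_stime);	// delta = 0.5s
 *	bintime_add(&l->l_rtime, &delta);	// rtime += 0.5s
 *
 * The code performs the same computation in-place on l_rtime.  bintime
 * fractions are 64-bit binary fractions of a second, so the add/sub
 * pair is exact.
 */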

/*
 * Select the next LWP to run on the current CPU.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * On arrival here LWPs on a run queue are locked by spc_mutex which
	 * is currently held.  Idle LWPs are always locked by spc_lwplock,
	 * which may or may not be held here.  On exit from this code block,
	 * in all cases newl is locked by spc_lwplock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		newl->l_boostpri = PRI_NONE;
		spc->spc_curpriority = lwp_eprio(newl);
		spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		/*
		 * The idle LWP does not get set to LSONPROC, because
		 * otherwise it screws up the output from top(1) etc.
		 */
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_pflag |= LP_RUNNING;
		spc->spc_curpriority = PRI_IDLE;
		spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
		    SPCF_IDLE;
	}

	/*
	 * Only clear want_resched if there are no pending (slow) software
	 * interrupts.  We can do this without an atomic, because no new
	 * LWPs can appear in the queue due to our hold on spc_mutex, and
	 * the update to ci_want_resched will become globally visible before
	 * the release of spc_mutex becomes globally visible.
	 */
	if (ci->ci_data.cpu_softints == 0)
		ci->ci_want_resched = 0;

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * NOTE: l->l_cpu is not changed in this routine, because an LWP never
 * changes its own l_cpu (that would screw up curcpu on many ports and could
 * cause all kinds of other evil stuff).  l_cpu is always changed by some
 * other actor, when it's known the LWP is not running (the LP_RUNNING flag
 * is checked under lock).
 */
void
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	kmutex_t *lock;
	int oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
	KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
	ci = curcpu();
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		KASSERT((l->l_flag & LW_IDLE) == 0);
		l->l_stat = LSRUN;
		lwp_setlock(l, spc->spc_mutex);
		sched_enqueue(l);
		sched_preempted(l);

		/*
		 * Handle migration.  Note that "migrating LWP" may
		 * be reset here, if interrupt/preemption happens
		 * early in idle LWP.
		 */
		if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
			KASSERT((l->l_pflag & LP_INTR) == 0);
			spc->spc_migrating = l;
		}
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Count time spent in current system call */
		SYSCALL_TIME_SLEEP(l);

		updatertime(l, &bt);

		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use ci_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done holding spc_mutex.  Clear
	 * l_dopreempt without an atomic - it's only ever set non-zero by
	 * sched_resched_cpu() which also holds spc_mutex, and only ever
	 * cleared by the LWP itself (us) with atomics when not under lock.
	 */
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/* We're down to only one lock, so do debug checks. */
		LOCKDEBUG_BARRIER(l->l_mutex, 1);

		/* Count the context switch. */
		CPU_COUNT(CPU_COUNT_NSWTCH, 1);
		if ((l->l_pflag & LP_PREEMPTING) != 0) {
			l->l_ru.ru_nivcsw++;
			l->l_pflag &= ~LP_PREEMPTING;
		} else {
			l->l_ru.ru_nvcsw++;
		}

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED after
		 * releasing the lock.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    "%s: cpu%u: ci_mtx_count (%d) != -1 "
		    "(block with spin-mutex held)",
		    __func__, cpu_index(ci), ci->ci_mtx_count);
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count = -2;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
			    LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
		}

		/*
		 * If curlwp is a soft interrupt LWP, there's nobody on the
		 * other side to unlock - we're returning into an assembly
		 * trampoline.  Unlock now.  This is safe because this is a
		 * kernel LWP and is bound to current CPU: the worst anyone
		 * else will do to it, is to put it back onto this CPU's run
		 * queue (and the CPU is busy here right now!).
		 */
		if (returning) {
			/* Keep IPL_SCHED after this; MD code will fix up. */
			l->l_pflag &= ~LP_RUNNING;
			lwp_unlock(l);
		} else {
			/* A normal LWP: save old VM context. */
			pmap_deactivate(l);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/*
		 * We must ensure not to come here from inside a read section.
		 */
		KASSERT(pserialize_not_in_read_section());

		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
		    l, curlwp, prevlwp);
		KASSERT(prevlwp != NULL);
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -2);

		/*
		 * Immediately mark the previous LWP as no longer running
		 * and unlock (to keep lock wait times as short as possible).
		 * We'll still be at IPL_SCHED afterwards.  If a zombie,
		 * don't touch after clearing LP_RUNNING as it could be
		 * reaped by another CPU.  Issue a memory barrier to ensure
		 * this.
		 *
		 * atomic_store_release matches atomic_load_acquire in
		 * lwp_free.
		 */
		KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
		lock = prevlwp->l_mutex;
		if (__predict_false(prevlwp->l_stat == LSZOMB)) {
			atomic_store_release(&prevlwp->l_pflag,
			    prevlwp->l_pflag & ~LP_RUNNING);
		} else {
			prevlwp->l_pflag &= ~LP_RUNNING;
		}
		mutex_spin_exit(lock);

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		pcu_switchpoint(l);

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/*
		 * Normalize the spin mutex count and restore the previous
		 * SPL.  Note that, unless the caller disabled preemption,
		 * we can be preempted at any time after this splx().
		 */
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -1);
		ci->ci_mtx_count = 0;
		splx(oldspl);
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		l->l_pflag &= ~LP_PREEMPTING;
		lwp_unlock(l);
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);

	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	kmutex_t *oldlock;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the
		 * debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	case LSIDL:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;
	sched_enqueue(l);
	sched_resched_lwp(l, true);
	/* SPC & LWP now unlocked. */
	mutex_spin_exit(oldlock);
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		if (p->p_stat != SSTOP) {
			if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
				p->p_pptr->p_nstopchild++;
				p->p_waited = 0;
			}
			p->p_stat = SSTOP;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to the
			 * user/kernel boundary, so that they will release
			 * any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(&proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 *
	 * Unusually, we don't hold any other scheduler object locked, which
	 * would keep preemption off for sched_resched_cpu(), so disable it
	 * explicitly.
	 */
	kpreempt_disable();
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		sched_resched_cpu(ci, PRI_KERNEL, true);
		/* spc now unlocked */
	}
	kpreempt_enable();
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_priority = pri;
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_priority = pri;
	}
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	}
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};
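
/*
 * A sanity check on the constants above (explanatory, not from the
 * original code): p_pctcpu is multiplied by ccpu = exp(-1/20) once per
 * second, so after 60 seconds the remaining weight is exp(-60/20) =
 * exp(-3) ~= 0.0498, i.e. about 95% has decayed, matching the comment.
 * Likewise, the load averages are sampled every 5 seconds, so the
 * 1-minute constant is exp(-5/60) = exp(-1/12), and the 5- and
 * 15-minute constants are exp(-5/300) = exp(-1/60) and exp(-5/900) =
 * exp(-1/180).
 */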

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute the load averages over 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwardslwp = false;
	static bool backwardsproc = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		time_t runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			if (__predict_false(l->l_rtime.sec < 0) &&
			    !backwardslwp) {
				backwardslwp = true;
				printf("WARNING: lwp %ld (%s%s%s): "
				    "negative runtime: "
				    "(%jd + 0x%016"PRIx64"/2^64) sec\n",
				    (long)l->l_lid,
				    l->l_proc->p_comm,
				    l->l_name ? " " : "",
				    l->l_name ? l->l_name : "",
				    (intmax_t)l->l_rtime.sec,
				    l->l_rtime.frac);
			}
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculate p_pctcpu only for ps(1). */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		if (__predict_false(runtm < 0)) {
			if (!backwardsproc) {
				backwardsproc = true;
				printf("WARNING: pid %ld (%s): "
				    "negative runtime; "
				    "monotonic clock has gone backwards\n",
				    (long)p->p_pid, p->p_comm);
			}
			mutex_exit(p->p_lock);
			continue;
		}

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE,
				    "pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);

	mutex_exit(&proc_lock);
}