/*	$NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2009, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Sleep queue implementation, used by turnstiles and general sleep/wakeup
 * interfaces.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>

/*
 * for sleepq_abort:
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (IPL_SAFEPRI) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
#ifndef	IPL_SAFEPRI
#define	IPL_SAFEPRI	0
#endif

static int	sleepq_sigtoerror(lwp_t *, int);

/* General purpose sleep table, used by mtsleep() and condition variables. */
sleeptab_t	sleeptab	__cacheline_aligned;
sleepqlock_t	sleepq_locks[SLEEPTAB_HASH_SIZE]	__cacheline_aligned;

/*
 * sleeptab_init:
 *
 *	Initialize a sleep table.
 */
void
sleeptab_init(sleeptab_t *st)
{
	static bool again;
	int i;

	for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) {
		if (!again) {
			mutex_init(&sleepq_locks[i].lock, MUTEX_DEFAULT,
			    IPL_SCHED);
		}
		sleepq_init(&st->st_queue[i]);
	}
	again = true;
}
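/*
 * Illustrative sketch only (not lifted from a real caller): how a wait
 * channel address maps to a queue/lock pair in the general purpose sleep
 * table above.  SLEEPTAB_HASH() is assumed here to be the bucket hash
 * provided by <sys/sleepq.h>.
 *
 *	u_int hash = SLEEPTAB_HASH(wchan);
 *	sleepq_t *sq = &sleeptab.st_queue[hash];
 *	kmutex_t *mp = &sleepq_locks[hash].lock;
 *
 *	mutex_spin_enter(mp);	// lock the bucket before touching sq
 */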
/*
 * sleepq_init:
 *
 *	Prepare a sleep queue for use.
 */
void
sleepq_init(sleepq_t *sq)
{

	LIST_INIT(sq);
}

/*
 * sleepq_remove:
 *
 *	Remove an LWP from a sleep queue and wake it up.  Distinguish
 *	between deliberate wakeups (which carry valuable information) and
 *	"unsleep" (an out-of-band action must be taken).
 *
 *	For wakeup, convert any interruptible wait into a non-interruptible
 *	one before waking the LWP.  Otherwise, if only one LWP is awoken it
 *	could fail to do something useful with the wakeup due to an error
 *	return and the caller of e.g. cv_signal() may not expect this.
 */
void
sleepq_remove(sleepq_t *sq, lwp_t *l, bool wakeup)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_NULL) == 0) {
		KASSERT(sq != NULL);
		LIST_REMOVE(l, l_sleepchain);
	} else {
		KASSERT(sq == NULL);
	}

	l->l_syncobj = &sched_syncobj;
	l->l_wchan = NULL;
	l->l_sleepq = NULL;
	l->l_flag &= wakeup ? ~(LW_SINTR|LW_CATCHINTR|LW_STIMO) : ~LW_SINTR;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	/*
	 * If not sleeping, the LWP must have been stopped or suspended.
	 * Let whoever holds it in that state set it running again.
	 */
	if (l->l_stat != LSSLEEP) {
		KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED);
		lwp_setlock(l, spc->spc_lwplock);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_setlock(l, spc->spc_lwplock);
		return;
	}

	/* Update the sleep time delta and call the scheduler's wakeup hook. */
	l->l_slpticksum += (getticks() - l->l_slpticks);
	sched_wakeup(l);

	/* Choose a CPU to run the LWP on. */
	l->l_cpu = sched_takecpu(l);
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	/*
	 * Set it running.
	 */
	spc_lock(ci);
	lwp_setlock(l, spc->spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;
	sched_enqueue(l);
	sched_resched_lwp(l, true);
	/* LWP & SPC now unlocked, but we still hold the sleep queue lock. */
}

/*
 * sleepq_insert:
 *
 *	Insert an LWP into the sleep queue, optionally sorting by priority.
 */
static void
sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj)
{

	if ((sobj->sobj_flag & SOBJ_SLEEPQ_NULL) != 0) {
		KASSERT(sq == NULL);
		return;
	}
	KASSERT(sq != NULL);

	if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
		lwp_t *l2, *l_last = NULL;
		const pri_t pri = lwp_eprio(l);

		LIST_FOREACH(l2, sq, l_sleepchain) {
			l_last = l2;
			if (lwp_eprio(l2) < pri) {
				LIST_INSERT_BEFORE(l2, l, l_sleepchain);
				return;
			}
		}
		/*
		 * Ensure FIFO ordering if no waiters are of lower priority.
		 */
		if (l_last != NULL) {
			LIST_INSERT_AFTER(l_last, l, l_sleepchain);
			return;
		}
	}

	LIST_INSERT_HEAD(sq, l, l_sleepchain);
}

/*
 * sleepq_enter:
 *
 *	Prepare to block on a sleep queue, after which any interlock can be
 *	safely released.
 */
int
sleepq_enter(sleepq_t *sq, lwp_t *l, kmutex_t *mp)
{
	int nlocks;

	KASSERT((sq != NULL) == (mp != NULL));

	/*
	 * Acquire the per-LWP mutex and lend it our sleep queue lock.
	 * Once interlocked, we can release the kernel lock.
	 */
	lwp_lock(l);
	if (mp != NULL) {
		lwp_unlock_to(l, mp);
	}
	if (__predict_false((nlocks = l->l_blcnt) != 0)) {
		KERNEL_UNLOCK_ALL(NULL, NULL);
	}
	return nlocks;
}
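/*
 * Illustrative sketch of how a blocking primitive composes sleepq_enter(),
 * sleepq_enqueue() and sleepq_block(); compare cv_wait() and friends in
 * kern_condvar.c, whose details differ.  "wchan", "mtx" (the caller's
 * interlock) and "my_syncobj" are hypothetical.
 *
 *	u_int hash = SLEEPTAB_HASH(wchan);
 *	sleepq_t *sq = &sleeptab.st_queue[hash];
 *	kmutex_t *mp = &sleepq_locks[hash].lock;
 *	int nlocks, error;
 *
 *	mutex_spin_enter(mp);
 *	nlocks = sleepq_enter(sq, curlwp, mp);	// LWP now locked by mp
 *	sleepq_enqueue(sq, wchan, "example", &my_syncobj, true);
 *	mutex_exit(mtx);			// interlock can go now
 *	error = sleepq_block(0, true, &my_syncobj, nlocks);
 */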
/*
 * sleepq_enqueue:
 *
 *	Enter an LWP into the sleep queue and prepare for sleep.  The sleep
 *	queue must already be locked, and any interlock (such as the kernel
 *	lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
 */
void
sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj,
    bool catch_p)
{
	lwp_t *l = curlwp;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_stat == LSONPROC);
	KASSERT(l->l_wchan == NULL);
	KASSERT(l->l_sleepq == NULL);
	KASSERT((l->l_flag & LW_SINTR) == 0);

	l->l_syncobj = sobj;
	l->l_wchan = wchan;
	l->l_sleepq = sq;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_stat = LSSLEEP;
	if (catch_p)
		l->l_flag |= LW_SINTR;

	sleepq_insert(sq, l, sobj);

	/* Record the time at which the LWP went to sleep. */
	l->l_slpticks = getticks();
	sched_slept(l);
}

/*
 * sleepq_transfer:
 *
 *	Move an LWP from one sleep queue to another.  Both sleep queues
 *	must already be locked.
 *
 *	The LWP will be updated with the new sleepq, wchan, wmesg,
 *	sobj, and mutex.  The interruptible flag will also be updated.
 */
void
sleepq_transfer(lwp_t *l, sleepq_t *from_sq, sleepq_t *sq, wchan_t wchan,
    const char *wmesg, syncobj_t *sobj, kmutex_t *mp, bool catch_p)
{

	KASSERT(l->l_sleepq == from_sq);

	LIST_REMOVE(l, l_sleepchain);
	l->l_syncobj = sobj;
	l->l_wchan = wchan;
	l->l_sleepq = sq;
	l->l_wmesg = wmesg;

	if (catch_p)
		l->l_flag |= LW_SINTR | LW_CATCHINTR;
	else
		l->l_flag &= ~(LW_SINTR | LW_CATCHINTR);

	/*
	 * This allows the transfer from one sleepq to another where
	 * it is known that they're both protected by the same lock.
	 */
	if (mp != NULL)
		lwp_setlock(l, mp);

	sleepq_insert(sq, l, sobj);
}

/*
 * sleepq_uncatch:
 *
 *	Mark the LWP as no longer sleeping interruptibly.
 */
void
sleepq_uncatch(lwp_t *l)
{

	l->l_flag &= ~(LW_SINTR | LW_CATCHINTR | LW_STIMO);
}

/*
 * sleepq_block:
 *
 *	After any intermediate step such as releasing an interlock, switch.
 *	sleepq_block() may return early under exceptional conditions, for
 *	example if the LWP's containing process is exiting.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
sleepq_block(int timo, bool catch_p, syncobj_t *syncobj, int nlocks)
{
	const int mask = LW_CANCELLED|LW_WEXIT|LW_WCORE|LW_PENDSIG;
	int error = 0, sig, flag;
	struct proc *p;
	lwp_t *l = curlwp;
	bool early = false;

	ktrcsw(1, 0, syncobj);

	/*
	 * If sleeping interruptibly, check for pending signals, exits or
	 * core dump events.
	 *
	 * Note the usage of LW_CATCHINTR.  This expresses our intent
	 * to catch or not catch sleep interruptions, which might change
	 * while we are sleeping.  It is independent of LW_SINTR because
	 * we don't want to leave LW_SINTR set when the LWP is not asleep.
	 */
	if (catch_p) {
		if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
			l->l_flag &= ~LW_CANCELLED;
			error = EINTR;
			early = true;
		} else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
			early = true;
		l->l_flag |= LW_CATCHINTR;
	} else
		l->l_flag &= ~LW_CATCHINTR;

	if (early) {
		/* lwp_unsleep() will release the lock */
		lwp_unsleep(l, true);
	} else {
		/*
		 * The LWP may have already been awoken if the caller
		 * dropped the sleep queue lock between sleepq_enqueue() and
		 * sleepq_block().  If that happens l_stat will be LSONPROC
		 * and mi_switch() will treat this as a preemption.  No need
		 * to do anything special here.
		 */
		if (timo) {
			l->l_flag &= ~LW_STIMO;
			callout_schedule(&l->l_timeout_ch, timo);
		}
		l->l_boostpri = l->l_syncobj->sobj_boostpri;
		spc_lock(l->l_cpu);
		mi_switch(l);

		/* The LWP and sleep queue are now unlocked. */
		if (timo) {
			/*
			 * Even if the callout appears to have fired, we
			 * need to stop it in order to synchronise with
			 * other CPUs.  It's important that we do this in
			 * this LWP's context, and not during wakeup, in
			 * order to keep the callout & its cache lines
			 * co-located on the CPU with the LWP.
			 */
			(void)callout_halt(&l->l_timeout_ch, NULL);
			error = (l->l_flag & LW_STIMO) ? EWOULDBLOCK : 0;
		}
	}

	/*
	 * LW_CATCHINTR is only modified in this function OR when we
	 * are asleep (with the sleepq locked).  We can therefore safely
	 * test it unlocked here as it is guaranteed to be stable by
	 * virtue of us running.
	 *
	 * We do not bother clearing it if set; that would require us
	 * to take the LWP lock, and it doesn't seem worth the hassle
	 * considering it is only meaningful here inside this function,
	 * and is set to reflect intent upon entry.
	 */
	flag = atomic_load_relaxed(&l->l_flag);
	if (__predict_false((flag & mask) != 0)) {
		if ((flag & LW_CATCHINTR) == 0 || error != 0)
			/* nothing */;
		else if ((flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
			error = EINTR;
		else if ((flag & LW_PENDSIG) != 0) {
			/*
			 * Acquiring p_lock may cause us to recurse
			 * through the sleep path and back into this
			 * routine, but is safe because LWPs sleeping
			 * on locks are non-interruptible and we will
			 * not recurse again.
			 */
			p = l->l_proc;
			mutex_enter(p->p_lock);
			if (((sig = sigispending(l, 0)) != 0 &&
			    (sigprop[sig] & SA_STOP) == 0) ||
			    (sig = issignal(l)) != 0)
				error = sleepq_sigtoerror(l, sig);
			mutex_exit(p->p_lock);
		}
	}

	ktrcsw(0, 0, syncobj);
	if (__predict_false(nlocks != 0)) {
		KERNEL_LOCK(nlocks, NULL);
	}
	return error;
}
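/*
 * Illustrative sketch of the timed variant: a non-zero timo makes
 * sleepq_block() return EWOULDBLOCK if the timeout callout fired before
 * a wakeup arrived.  Hypothetical caller, continuing the earlier example:
 *
 *	error = sleepq_block(mstohz(100), true, &my_syncobj, nlocks);
 *	switch (error) {
 *	case 0:			// woken deliberately
 *		break;
 *	case EWOULDBLOCK:	// timed out (LW_STIMO was set)
 *		break;
 *	case EINTR:
 *	case ERESTART:		// interrupted by a signal or exit
 *		break;
 *	}
 */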
/*
 * sleepq_wake:
 *
 *	Wake zero or more LWPs blocked on a single wait channel.
 */
void
sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp)
{
	lwp_t *l, *next;

	KASSERT(mutex_owned(mp));

	for (l = LIST_FIRST(sq); l != NULL; l = next) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		next = LIST_NEXT(l, l_sleepchain);
		if (l->l_wchan != wchan)
			continue;
		sleepq_remove(sq, l, true);
		if (--expected == 0)
			break;
	}

	mutex_spin_exit(mp);
}

/*
 * sleepq_unsleep:
 *
 *	Remove an LWP from its sleep queue and set it runnable again.
 *	sleepq_unsleep() is called with the LWP's mutex held, and will
 *	release it if "unlock" is true.
 */
void
sleepq_unsleep(lwp_t *l, bool unlock)
{
	sleepq_t *sq = l->l_sleepq;
	kmutex_t *mp = l->l_mutex;

	KASSERT(lwp_locked(l, mp));
	KASSERT(l->l_wchan != NULL);

	sleepq_remove(sq, l, false);
	if (unlock) {
		mutex_spin_exit(mp);
	}
}

/*
 * sleepq_timeout:
 *
 *	Entered via the callout(9) subsystem to time out an LWP that is on a
 *	sleep queue.
 */
void
sleepq_timeout(void *arg)
{
	lwp_t *l = arg;

	/*
	 * Lock the LWP.  Assuming it's still on the sleep queue, its
	 * current mutex will also be the sleep queue mutex.
	 */
	lwp_lock(l);

	if (l->l_wchan == NULL || l->l_syncobj == &callout_syncobj) {
		/*
		 * Somebody beat us to it, or the LWP is blocked in
		 * callout_halt() waiting for us to finish here.  In
		 * neither case should the LWP produce EWOULDBLOCK.
		 */
		lwp_unlock(l);
		return;
	}

	l->l_flag |= LW_STIMO;
	lwp_unsleep(l, true);
}
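/*
 * Illustrative note: each LWP's l_timeout_ch callout is bound to
 * sleepq_timeout() when the LWP is created (see kern_lwp.c), roughly:
 *
 *	callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE);
 *	callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l);
 *
 * so the callout_schedule() call in sleepq_block() is all that is
 * needed to arm a sleep timeout.
 */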
/*
 * sleepq_sigtoerror:
 *
 *	Given a signal number, interpret and return an error code.
 */
static int
sleepq_sigtoerror(lwp_t *l, int sig)
{
	struct proc *p = l->l_proc;
	int error;

	KASSERT(mutex_owned(p->p_lock));

	/*
	 * If this sleep was canceled, don't let the syscall restart.
	 */
	if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
		error = EINTR;
	else
		error = ERESTART;

	return error;
}

/*
 * sleepq_abort:
 *
 *	After a panic or during autoconfiguration, lower the interrupt
 *	priority level to give pending interrupts a chance to run, and
 *	then return.  Called if sleepq_dontsleep() returns non-zero, and
 *	always returns zero.
 */
int
sleepq_abort(kmutex_t *mtx, int unlock)
{
	int s;

	s = splhigh();
	splx(IPL_SAFEPRI);
	splx(s);
	if (mtx != NULL && unlock != 0)
		mutex_exit(mtx);

	return 0;
}

/*
 * sleepq_reinsert:
 *
 *	Move the position of the LWP in the sleep queue after a possible
 *	change of the LWP's effective priority.
 */
static void
sleepq_reinsert(sleepq_t *sq, lwp_t *l)
{

	KASSERT(l->l_sleepq == sq);
	if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
		return;
	}

	/*
	 * Don't let the sleep queue become empty, even briefly.
	 * cv_signal() and cv_broadcast() inspect it without the
	 * sleep queue lock held and need to see a non-empty queue
	 * head if there are waiters.
	 */
	if (LIST_FIRST(sq) == l && LIST_NEXT(l, l_sleepchain) == NULL) {
		return;
	}
	LIST_REMOVE(l, l_sleepchain);
	sleepq_insert(sq, l, l->l_syncobj);
}

/*
 * sleepq_changepri:
 *
 *	Adjust the priority of an LWP residing on a sleepq.
 */
void
sleepq_changepri(lwp_t *l, pri_t pri)
{
	sleepq_t *sq = l->l_sleepq;

	KASSERT(lwp_locked(l, NULL));

	l->l_priority = pri;
	sleepq_reinsert(sq, l);
}

/*
 * sleepq_lendpri:
 *
 *	Adjust the lent priority of an LWP residing on a sleepq.
 */
void
sleepq_lendpri(lwp_t *l, pri_t pri)
{
	sleepq_t *sq = l->l_sleepq;

	KASSERT(lwp_locked(l, NULL));

	l->l_inheritedprio = pri;
	l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	sleepq_reinsert(sq, l);
}
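/*
 * Illustrative note: sleepq_changepri() and sleepq_lendpri() are not
 * usually called directly; priority inheritance code reaches them via
 * the sleeping LWP's syncobj with the LWP locked, roughly:
 *
 *	lwp_lock(l);
 *	(*l->l_syncobj->sobj_lendpri)(l, pri);	// -> sleepq_lendpri()
 *	lwp_unlock(l);
 *
 * (compare lwp_lendpri() in kern_lwp.c, which differs in detail).
 */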