/*	$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $");

#include <sys/param.h>

#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq_t
 *	cv_opaque[1]	description for ps(1)
 *
 * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[1] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[1])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[1] = __UNCONST(v)

#define	CV_DEBUG_P(cv)		(CV_WMESG(cv) != nodebug)
#define	CV_RA			((uintptr_t)__builtin_return_address(0))

static void	cv_unsleep(lwp_t *, bool);
static inline void	cv_wakeup_one(kcondvar_t *);
static inline void	cv_wakeup_all(kcondvar_t *);

/*
 * Sync object operations vector plugged into the generic sleep queue
 * machinery for LWPs sleeping on a CV.
 */
syncobj_t cv_syncobj = {
	.sobj_name	= "cv",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/* Wmesg installed by cv_destroy() so use of a dead CV can be detected. */
static const char deadcv[] = "deadcv";

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{

	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	KASSERT(!cv_has_waiters(cv));
	/* Poison the wmesg so a later cv_is_valid() on this CV fires. */
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 *
 *	Called with the interlock "mtx" held; enqueues the LWP under the
 *	sleep queue lock and only then drops the interlock, so a wakeup
 *	cannot be missed.  Returns the lock-count from sleepq_enter(),
 *	which the caller passes on to sleepq_block().
 */
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
	sleepq_t *sq;
	kmutex_t *mp;
	int nlocks;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
	/* Interlock may be released only after we are on the queue. */
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
	return nlocks;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;	/* only referenced by KASSERTs */

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
171 */ 172void 173cv_wait(kcondvar_t *cv, kmutex_t *mtx) 174{ 175 lwp_t *l = curlwp; 176 int nlocks; 177 178 KASSERT(mutex_owned(mtx)); 179 180 nlocks = cv_enter(cv, mtx, l, false); 181 (void)sleepq_block(0, false, &cv_syncobj, nlocks); 182 mutex_enter(mtx); 183} 184 185/* 186 * cv_wait_sig: 187 * 188 * Wait on a condition variable until a awoken or a signal is received. 189 * Will also return early if the process is exiting. Returns zero if 190 * awoken normally, ERESTART if a signal was received and the system 191 * call is restartable, or EINTR otherwise. 192 */ 193int 194cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx) 195{ 196 lwp_t *l = curlwp; 197 int error, nlocks; 198 199 KASSERT(mutex_owned(mtx)); 200 201 nlocks = cv_enter(cv, mtx, l, true); 202 error = sleepq_block(0, true, &cv_syncobj, nlocks); 203 mutex_enter(mtx); 204 return error; 205} 206 207/* 208 * cv_timedwait: 209 * 210 * Wait on a condition variable until awoken or the specified timeout 211 * expires. Returns zero if awoken normally or EWOULDBLOCK if the 212 * timeout expired. 213 * 214 * timo is a timeout in ticks. timo = 0 specifies an infinite timeout. 215 */ 216int 217cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo) 218{ 219 lwp_t *l = curlwp; 220 int error, nlocks; 221 222 KASSERT(mutex_owned(mtx)); 223 224 nlocks = cv_enter(cv, mtx, l, false); 225 error = sleepq_block(timo, false, &cv_syncobj, nlocks); 226 mutex_enter(mtx); 227 return error; 228} 229 230/* 231 * cv_timedwait_sig: 232 * 233 * Wait on a condition variable until a timeout expires, awoken or a 234 * signal is received. Will also return early if the process is 235 * exiting. Returns zero if awoken normally, EWOULDBLOCK if the 236 * timeout expires, ERESTART if a signal was received and the system 237 * call is restartable, or EINTR otherwise. 238 * 239 * timo is a timeout in ticks. timo = 0 specifies an infinite timeout. 
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	/* Catch signals (catch_p = true) and bound the sleep by "timo". */
	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(timo, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	/* Clamp if the whole-seconds part alone would overflow. */
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	/* Clamp if seconds plus the fractional contribution would. */
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
 * second), from which we can conclude 2^64 / hz = 1 (2^64th of a
 * second)/tick.
 * So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, possibly negative if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >=1. */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens instead of overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >=1. */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens instead of overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait_sig(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.  Must
 *	be called with the interlocking mutex held or just after it has been
 *	released (so the awoken LWP will see the changed condition).
 */
void
cv_signal(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	/*
	 * Unlocked peek at the queue; safe because the caller holds (or
	 * has just released) the interlock, and the slow path re-checks
	 * under the sleep queue lock.
	 */
	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * Compiler turns into a tail call usually, i.e. jmp,
		 * because the arguments are the same and no locals.
		 */
		cv_wakeup_one(cv);
	}
}

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_signal(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	/* Re-check under the sleep queue lock: the queue may have drained. */
	if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called with
 *	the interlocking mutex held or just after it has been released (so
 *	the awoken LWP will see the changed condition).
 */
void
cv_broadcast(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	/* Unlocked peek, as in cv_signal(); slow path re-checks locked. */
	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * Compiler turns into a tail call usually, i.e. jmp,
		 * because the arguments are the same and no locals.
		 */
		cv_wakeup_all(cv);
	}
}

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_broadcast(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	/* Drain the queue entirely under the sleep queue lock. */
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !LIST_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
565 */ 566bool 567cv_is_valid(kcondvar_t *cv) 568{ 569 570 return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL; 571} 572