1/* $NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $ */ 2 3/* 4 * Copyright (c) 2010, 2011 Antti Kantee. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28#include <sys/cdefs.h> 29__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $"); 30 31#include <sys/param.h> 32#include <sys/atomic.h> 33#include <sys/cpu.h> 34#include <sys/kmem.h> 35#include <sys/mutex.h> 36#include <sys/namei.h> 37#include <sys/queue.h> 38#include <sys/select.h> 39#include <sys/systm.h> 40 41#include <rump-sys/kern.h> 42 43#include <rump/rumpuser.h> 44 45static struct rumpcpu { 46 /* needed in fastpath */ 47 struct cpu_info *rcpu_ci; 48 void *rcpu_prevlwp; 49 50 /* needed in slowpath */ 51 struct rumpuser_mtx *rcpu_mtx; 52 struct rumpuser_cv *rcpu_cv; 53 int rcpu_wanted; 54 55 /* offset 20 (P=4) or 36 (P=8) here */ 56 57 /* 58 * Some stats. Not really that necessary, but we should 59 * have room. Note that these overflow quite fast, so need 60 * to be collected often. 61 */ 62 unsigned int rcpu_fastpath; 63 unsigned int rcpu_slowpath; 64 unsigned int rcpu_migrated; 65 66 /* offset 32 (P=4) or 50 (P=8) */ 67 68 int rcpu_align[0] __aligned(CACHE_LINE_SIZE); 69} rcpu_storage[MAXCPUS]; 70 71static inline struct rumpcpu * 72cpuinfo_to_rumpcpu(struct cpu_info *ci) 73{ 74 75 return &rcpu_storage[cpu_index(ci)]; 76} 77 78struct cpu_info rump_bootcpu; 79 80#define RCPULWP_BUSY ((void *)-1) 81#define RCPULWP_WANTED ((void *)-2) 82 83static struct rumpuser_mtx *lwp0mtx; 84static struct rumpuser_cv *lwp0cv; 85static unsigned nextcpu; 86 87kmutex_t unruntime_lock; /* unruntime lwp lock. practically unused */ 88 89static bool lwp0isbusy = false; 90 91/* 92 * Keep some stats. 93 * 94 * Keeping track of there is not really critical for speed, unless 95 * stats happen to be on a different cache line (CACHE_LINE_SIZE is 96 * really just a coarse estimate), so default for the performant case 97 * (i.e. no stats). 98 */ 99#ifdef RUMPSCHED_STATS 100#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++; 101#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++; 102#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++; 103#else 104#define SCHED_FASTPATH(rcpu) 105#define SCHED_SLOWPATH(rcpu) 106#define SCHED_MIGRATED(rcpu) 107#endif 108 109struct cpu_info * 110cpu_lookup(u_int index) 111{ 112 113 return rcpu_storage[index].rcpu_ci; 114} 115 116static inline struct rumpcpu * 117getnextcpu(void) 118{ 119 unsigned newcpu; 120 121 newcpu = atomic_inc_uint_nv(&nextcpu); 122 if (__predict_false(ncpu > UINT_MAX/2)) 123 atomic_and_uint(&nextcpu, 0); 124 newcpu = newcpu % ncpu; 125 126 return &rcpu_storage[newcpu]; 127} 128 129/* this could/should be mi_attach_cpu? */ 130void 131rump_cpus_bootstrap(int *nump) 132{ 133 int num = *nump; 134 135 if (num > MAXCPUS) { 136 aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) " 137 "available (adjusted)\n", num, MAXCPUS); 138 num = MAXCPUS; 139 } 140 141 cpu_setmodel("rumpcore (virtual)"); 142 143 mi_cpu_init(); 144 145 /* attach first cpu for bootstrap */ 146 rump_cpu_attach(&rump_bootcpu); 147 ncpu = 1; 148 *nump = num; 149} 150 151void 152rump_scheduler_init(int numcpu) 153{ 154 struct rumpcpu *rcpu; 155 struct cpu_info *ci; 156 int i; 157 158 rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN); 159 rumpuser_cv_init(&lwp0cv); 160 for (i = 0; i < numcpu; i++) { 161 if (i == 0) { 162 ci = &rump_bootcpu; 163 } else { 164 ci = kmem_zalloc(sizeof(*ci), KM_SLEEP); 165 ci->ci_index = i; 166 } 167 168 rcpu = &rcpu_storage[i]; 169 rcpu->rcpu_ci = ci; 170 rcpu->rcpu_wanted = 0; 171 rumpuser_cv_init(&rcpu->rcpu_cv); 172 rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN); 173 174 ci->ci_schedstate.spc_mutex = 175 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); 176 ci->ci_schedstate.spc_flags = SPCF_RUNNING; 177 } 178 179 mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED); 180} 181 182void 183rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv) 184{ 185 struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci); 186 187 rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx); 188 rumpuser_cv_signal(cv); 189 rumpuser_mutex_exit(rcpu->rcpu_mtx); 190} 191 192/* 193 * condvar ops using scheduler lock as the rumpuser interlock. 194 */ 195void 196rump_schedlock_cv_wait(struct rumpuser_cv *cv) 197{ 198 struct lwp *l = curlwp; 199 struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu); 200 201 /* mutex will be taken and released in cpu schedule/unschedule */ 202 rumpuser_cv_wait(cv, rcpu->rcpu_mtx); 203} 204 205int 206rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts) 207{ 208 struct lwp *l = curlwp; 209 struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu); 210 211 /* mutex will be taken and released in cpu schedule/unschedule */ 212 return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx, 213 ts->tv_sec, ts->tv_nsec); 214} 215 216static void 217lwp0busy(void) 218{ 219 220 /* busy lwp0 */ 221 KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC); 222 rumpuser_mutex_enter_nowrap(lwp0mtx); 223 while (lwp0isbusy) 224 rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx); 225 lwp0isbusy = true; 226 rumpuser_mutex_exit(lwp0mtx); 227} 228 229static void 230lwp0rele(void) 231{ 232 233 rumpuser_mutex_enter_nowrap(lwp0mtx); 234 KASSERT(lwp0isbusy == true); 235 lwp0isbusy = false; 236 rumpuser_cv_signal(lwp0cv); 237 rumpuser_mutex_exit(lwp0mtx); 238} 239 240/* 241 * rump_schedule: ensure that the calling host thread has a valid lwp context. 242 * ie. ensure that curlwp != NULL. Also, ensure that there 243 * a 1:1 mapping between the lwp and rump kernel cpu. 244 */ 245void 246rump_schedule() 247{ 248 struct lwp *l; 249 250 /* 251 * If there is no dedicated lwp, allocate a temp one and 252 * set it to be free'd upon unschedule(). Use lwp0 context 253 * for reserving the necessary resources. Don't optimize 254 * for this case -- anyone who cares about performance will 255 * start a real thread. 256 */ 257 if (__predict_true((l = curlwp) != NULL)) { 258 struct proc *p = l->l_proc; 259 rump_schedule_cpu(l); 260 if (l->l_cred != p->p_cred) { 261 kauth_cred_t oc = l->l_cred; 262 mutex_enter(p->p_lock); 263 l->l_cred = kauth_cred_hold(p->p_cred); 264 mutex_exit(p->p_lock); 265 kauth_cred_free(oc); 266 } 267 } else { 268 lwp0busy(); 269 270 /* schedule cpu and use lwp0 */ 271 rump_schedule_cpu(&lwp0); 272 rump_lwproc_curlwp_set(&lwp0); 273 274 /* allocate thread, switch to it, and release lwp0 */ 275 l = rump__lwproc_alloclwp(initproc); 276 rump_lwproc_switch(l); 277 lwp0rele(); 278 279 /* 280 * mark new thread dead-on-unschedule. this 281 * means that we'll be running with l_refcnt == 0. 282 * relax, it's fine. 283 */ 284 rump_lwproc_releaselwp(); 285 } 286} 287 288void 289rump_schedule_cpu(struct lwp *l) 290{ 291 292 rump_schedule_cpu_interlock(l, NULL); 293} 294 295/* 296 * Schedule a CPU. This optimizes for the case where we schedule 297 * the same thread often, and we have nCPU >= nFrequently-Running-Thread 298 * (where CPU is virtual rump cpu, not host CPU). 299 */ 300void 301rump_schedule_cpu_interlock(struct lwp *l, void *interlock) 302{ 303 struct rumpcpu *rcpu; 304 struct cpu_info *ci; 305 void *old; 306 bool domigrate; 307 bool bound = l->l_pflag & LP_BOUND; 308 309 l->l_stat = LSRUN; 310 311 /* 312 * First, try fastpath: if we were the previous user of the 313 * CPU, everything is in order cachewise and we can just 314 * proceed to use it. 315 * 316 * If we are a different thread (i.e. CAS fails), we must go 317 * through a memory barrier to ensure we get a truthful 318 * view of the world. 319 */ 320 321 KASSERT(l->l_target_cpu != NULL); 322 rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu); 323 if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) { 324 if (interlock == rcpu->rcpu_mtx) 325 rumpuser_mutex_exit(rcpu->rcpu_mtx); 326 SCHED_FASTPATH(rcpu); 327 /* jones, you're the man */ 328 goto fastlane; 329 } 330 331 /* 332 * Else, it's the slowpath for us. First, determine if we 333 * can migrate. 334 */ 335 if (ncpu == 1) 336 domigrate = false; 337 else 338 domigrate = true; 339 340 /* Take lock. This acts as a load barrier too. */ 341 if (interlock != rcpu->rcpu_mtx) 342 rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx); 343 344 for (;;) { 345 SCHED_SLOWPATH(rcpu); 346 old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED); 347 348 /* CPU is free? */ 349 if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) { 350 if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, 351 RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) { 352 break; 353 } 354 } 355 356 /* 357 * Do we want to migrate once? 358 * This may need a slightly better algorithm, or we 359 * might cache pingpong eternally for non-frequent 360 * threads. 361 */ 362 if (domigrate && !bound) { 363 domigrate = false; 364 SCHED_MIGRATED(rcpu); 365 rumpuser_mutex_exit(rcpu->rcpu_mtx); 366 rcpu = getnextcpu(); 367 rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx); 368 continue; 369 } 370 371 /* Want CPU, wait until it's released an retry */ 372 rcpu->rcpu_wanted++; 373 rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx); 374 rcpu->rcpu_wanted--; 375 } 376 rumpuser_mutex_exit(rcpu->rcpu_mtx); 377 378 fastlane: 379 ci = rcpu->rcpu_ci; 380 l->l_cpu = l->l_target_cpu = ci; 381 l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex; 382 l->l_ru.ru_nvcsw++; 383 l->l_stat = LSONPROC; 384 385 /* 386 * No interrupts, so ci_curlwp === cpu_onproc. 387 * Okay, we could make an attempt to not set cpu_onproc 388 * in the case that an interrupt is scheduled immediately 389 * after a user proc, but leave that for later. 390 */ 391 ci->ci_curlwp = ci->ci_onproc = l; 392} 393 394void 395rump_unschedule() 396{ 397 struct lwp *l = curlwp; 398#ifdef DIAGNOSTIC 399 int nlock; 400 401 KERNEL_UNLOCK_ALL(l, &nlock); 402 KASSERT(nlock == 0); 403#endif 404 405 KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex); 406 rump_unschedule_cpu(l); 407 l->l_mutex = &unruntime_lock; 408 l->l_stat = LSSTOP; 409 410 /* 411 * Check special conditions: 412 * 1) do we need to free the lwp which just unscheduled? 413 * (locking order: lwp0, cpu) 414 * 2) do we want to clear curlwp for the current host thread 415 */ 416 if (__predict_false(l->l_flag & LW_WEXIT)) { 417 lwp0busy(); 418 419 /* Now that we have lwp0, we can schedule a CPU again */ 420 rump_schedule_cpu(l); 421 422 /* switch to lwp0. this frees the old thread */ 423 KASSERT(l->l_flag & LW_WEXIT); 424 rump_lwproc_switch(&lwp0); 425 426 /* release lwp0 */ 427 rump_unschedule_cpu(&lwp0); 428 lwp0.l_mutex = &unruntime_lock; 429 lwp0.l_pflag &= ~LP_RUNNING; 430 lwp0rele(); 431 rump_lwproc_curlwp_clear(&lwp0); 432 433 } else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) { 434 rump_lwproc_curlwp_clear(l); 435 l->l_flag &= ~LW_RUMP_CLEAR; 436 } 437} 438 439void 440rump_unschedule_cpu(struct lwp *l) 441{ 442 443 rump_unschedule_cpu_interlock(l, NULL); 444} 445 446void 447rump_unschedule_cpu_interlock(struct lwp *l, void *interlock) 448{ 449 450 if ((l->l_pflag & LP_INTR) == 0) 451 rump_softint_run(l->l_cpu); 452 rump_unschedule_cpu1(l, interlock); 453} 454 455void 456rump_unschedule_cpu1(struct lwp *l, void *interlock) 457{ 458 struct rumpcpu *rcpu; 459 struct cpu_info *ci; 460 void *old; 461 462 ci = l->l_cpu; 463 ci->ci_curlwp = ci->ci_onproc = NULL; 464 rcpu = cpuinfo_to_rumpcpu(ci); 465 466 KASSERT(rcpu->rcpu_ci == ci); 467 468 /* 469 * Make sure all stores are seen before the CPU release. This 470 * is relevant only in the non-fastpath scheduling case, but 471 * we don't know here if that's going to happen, so need to 472 * expect the worst. 473 * 474 * If the scheduler interlock was requested by the caller, we 475 * need to obtain it before we release the CPU. Otherwise, we risk a 476 * race condition where another thread is scheduled onto the 477 * rump kernel CPU before our current thread can 478 * grab the interlock. 479 */ 480 if (interlock == rcpu->rcpu_mtx) 481 rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx); 482 else 483 membar_release(); /* XXX what does this pair with? */ 484 485 /* Release the CPU. */ 486 old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l); 487 488 /* No waiters? No problems. We're outta here. */ 489 if (old == RCPULWP_BUSY) { 490 return; 491 } 492 493 KASSERT(old == RCPULWP_WANTED); 494 495 /* 496 * Ok, things weren't so snappy. 497 * 498 * Snailpath: take lock and signal anyone waiting for this CPU. 499 */ 500 501 if (interlock != rcpu->rcpu_mtx) 502 rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx); 503 if (rcpu->rcpu_wanted) 504 rumpuser_cv_broadcast(rcpu->rcpu_cv); 505 if (interlock != rcpu->rcpu_mtx) 506 rumpuser_mutex_exit(rcpu->rcpu_mtx); 507} 508 509/* Give up and retake CPU (perhaps a different one) */ 510void 511yield() 512{ 513 struct lwp *l = curlwp; 514 int nlocks; 515 516 KERNEL_UNLOCK_ALL(l, &nlocks); 517 rump_unschedule_cpu(l); 518 rump_schedule_cpu(l); 519 KERNEL_LOCK(nlocks, l); 520} 521 522void 523preempt() 524{ 525 526 yield(); 527} 528 529bool 530kpreempt(uintptr_t where) 531{ 532 533 return false; 534} 535 536/* 537 * There is no kernel thread preemption in rump currently. But call 538 * the implementing macros anyway in case they grow some side-effects 539 * down the road. 540 */ 541void 542kpreempt_disable(void) 543{ 544 545 KPREEMPT_DISABLE(curlwp); 546} 547 548void 549kpreempt_enable(void) 550{ 551 552 KPREEMPT_ENABLE(curlwp); 553} 554 555bool 556kpreempt_disabled(void) 557{ 558#if 0 559 const lwp_t *l = curlwp; 560 561 return l->l_nopreempt != 0 || l->l_stat == LSZOMB || 562 (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled(); 563#endif 564 /* XXX: emulate cpu_kpreempt_disabled() */ 565 return true; 566} 567 568void 569suspendsched(void) 570{ 571 572 /* 573 * Could wait until everyone is out and block further entries, 574 * but skip that for now. 575 */ 576} 577 578void 579sched_nice(struct proc *p, int level) 580{ 581 582 /* nothing to do for now */ 583} 584 585void 586setrunnable(struct lwp *l) 587{ 588 589 sched_enqueue(l); 590} 591 592void 593sched_enqueue(struct lwp *l) 594{ 595 596 rump_thread_allow(l); 597} 598 599void 600sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock) 601{ 602 603} 604 605void 606sched_resched_lwp(struct lwp *l, bool unlock) 607{ 608 609} 610 611void 612sched_dequeue(struct lwp *l) 613{ 614 615 panic("sched_dequeue not implemented"); 616} 617 618void 619preempt_point(void) 620{ 621 622} 623 624bool 625preempt_needed(void) 626{ 627 628 return false; 629} 630