/*	$NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $	*/

/*-
 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/atomic.h>

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)

const int	schedppq = 1;

static void	*sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
#endif

/*
 * Preemption control.
 */
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;	/* Cache hotness time */
static u_int	min_catch;	/* Minimal LWP count for catching */
static u_int	skim_interval;	/* Rate limit for stealing LWPs */

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Pulling from remote packages, LWP must not have run for 10ms. */
	cacheht_time = 10;

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Steal from other CPUs at most every 10ms. */
	skim_interval = 10;
}

void
sched_cpuattach(struct cpu_info *ci)
{
	struct schedstate_percpu *spc;
	size_t size;
	void *p;
	u_int i;

	spc = &ci->ci_schedstate;
	spc->spc_nextpkg = ci;

	if (spc->spc_lwplock == NULL) {
		spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = spc->spc_lwplock;
	}
	if (spc->spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
	    coherency_unit;
	p = kmem_alloc(size, KM_SLEEP);
	spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);

	/* Initialize run queues */
	spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_COUNT; i++)
		TAILQ_INIT(&spc->spc_queue[i]);
}

/*
 * Control of the runqueue.
 */
static inline void *
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return &spc->spc_queue[prio];
}

/*
 * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
 * l_cpu.
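 *
 * Run queue layout: spc_queue[] holds one TAILQ per priority level, and
 * spc_bitmap[] caches which queues are non-empty, MSB-first within each
 * 32-bit word.  For example, eprio 37 maps to word 37 >> BITMAP_SHIFT = 1
 * and bit BITMAP_MSB >> (37 & BITMAP_MASK) = 0x04000000.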
 */
void
sched_enqueue(struct lwp *l)
{
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Enqueue the thread */
	q_head = sched_getrq(spc, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((spc->spc_bitmap[i] & q) == 0);
		spc->spc_bitmap[i] |= q;
	}

	/*
	 * Determine run queue position according to POSIX.  XXX Explicitly
	 * lowering a thread's priority with pthread_setschedparam() is not
	 * handled.
	 */
	if ((l->l_pflag & LP_PREEMPTING) != 0) {
		switch (l->l_class) {
		case SCHED_OTHER:
			TAILQ_INSERT_TAIL(q_head, l, l_runq);
			break;
		case SCHED_FIFO:
			TAILQ_INSERT_HEAD(q_head, l, l_runq);
			break;
		case SCHED_RR:
			if (getticks() - l->l_rticks >= sched_rrticks) {
				TAILQ_INSERT_TAIL(q_head, l, l_runq);
			} else {
				TAILQ_INSERT_HEAD(q_head, l, l_runq);
			}
			break;
		default:
			panic("sched_enqueue: LWP %p has class %d\n",
			    l, l->l_class);
		}
	} else {
		TAILQ_INSERT_TAIL(q_head, l, l_runq);
	}
	spc->spc_flags &= ~SPCF_IDLE;
	spc->spc_count++;
	if ((l->l_pflag & LP_BOUND) == 0) {
		atomic_store_relaxed(&spc->spc_mcount,
		    atomic_load_relaxed(&spc->spc_mcount) + 1);
	}

	/*
	 * Update the value of highest priority in the runqueue,
	 * if priority of this thread is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);
}

/*
 * Remove an LWP from the run queue it's on.  The LWP must be in state
 * LSRUN.
 */
void
sched_dequeue(struct lwp *l)
{
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;

	KASSERT(lwp_locked(l, spc->spc_mutex));
	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(spc->spc_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	spc->spc_count--;
	if ((l->l_pflag & LP_BOUND) == 0) {
		atomic_store_relaxed(&spc->spc_mcount,
		    atomic_load_relaxed(&spc->spc_mcount) - 1);
	}

	q_head = sched_getrq(spc, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((spc->spc_bitmap[i] & q) != 0);
		spc->spc_bitmap[i] &= ~q;

		/*
		 * Update the value of highest priority in the runqueue, in
		 * case it was the last thread in the queue of highest
		 * priority.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (spc->spc_bitmap[i] != 0) {
				q = ffs(spc->spc_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If not found - set the lowest value */
		spc->spc_maxpriority = 0;
	}
}

/*
 * Cause a preemption on the given CPU, if the priority "pri" is higher
 * priority than the running LWP.  If "unlock" is specified, and ideally it
 * will be for concurrency reasons, spc_mutex will be dropped before return.
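 *
 * The flags accumulated in ci_want_resched record the strongest kind of
 * preemption requested so far; the CAS loop below relies on the
 * RESCHED_KPREEMPT bit comparing numerically greater than RESCHED_UPREEMPT,
 * so a stronger pending request is never downgraded by a weaker
 * concurrent one.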
 */
void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{
	struct schedstate_percpu *spc;
	u_int o, n, f;
	lwp_t *l;

	spc = &ci->ci_schedstate;

	KASSERT(mutex_owned(spc->spc_mutex));

	/*
	 * If the priority level we're evaluating wouldn't cause a new LWP
	 * to be run on the CPU, then we have nothing to do.
	 */
	if (pri <= spc->spc_curpriority || !mp_online) {
		if (__predict_true(unlock)) {
			spc_unlock(ci);
		}
		return;
	}

	/*
	 * Figure out what kind of preemption we should do.
	 */
	l = ci->ci_onproc;
	if ((l->l_flag & LW_IDLE) != 0) {
		f = RESCHED_IDLE | RESCHED_UPREEMPT;
	} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
		/* We can't currently preempt softints - should be able to. */
#ifdef __HAVE_PREEMPTION
		f = RESCHED_KPREEMPT;
#else
		/* Leave door open for test: set kpreempt_pri with sysctl. */
		f = RESCHED_UPREEMPT;
#endif
		/*
		 * l_dopreempt must be set with the CPU locked to sync with
		 * mi_switch().  It must also be set with an atomic to sync
		 * with kpreempt().
		 */
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
	} else {
		f = RESCHED_UPREEMPT;
	}
	if (ci != curcpu()) {
		f |= RESCHED_REMOTE;
	}

	/*
	 * Things can start as soon as ci_want_resched is touched: x86 has
	 * an instruction that monitors the memory cell it's in.  Drop the
	 * schedstate lock in advance, otherwise the remote CPU can awaken
	 * and immediately block on the lock.
	 */
	if (__predict_true(unlock)) {
		spc_unlock(ci);
	}

	/*
	 * The caller almost always has a second scheduler lock held: either
	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
	 * keeps preemption disabled, which among other things ensures all
	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
	 */
	KASSERT(kpreempt_disabled());

	for (o = 0;; o = n) {
		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
		if (__predict_true(o == n)) {
			/*
			 * We're the first to set a resched on the CPU.  Try
			 * to avoid causing a needless trip through trap()
			 * to handle an AST fault, if it's known the LWP
			 * will either block or go through userret() soon.
			 */
			if (l != curlwp || cpu_intr_p()) {
				cpu_need_resched(ci, l, f);
			}
			break;
		}
		if (__predict_true(
		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
			/* Already in progress, nothing to do. */
			break;
		}
	}
}

/*
 * Cause a preemption on the given CPU, if the priority of LWP "l" in state
 * LSRUN, is higher priority than the running LWP.  If "unlock" is
 * specified, and ideally it will be for concurrency reasons, spc_mutex will
 * be dropped before return.
 */
void
sched_resched_lwp(struct lwp *l, bool unlock)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
	KASSERT(l->l_stat == LSRUN);

	sched_resched_cpu(ci, lwp_eprio(l), unlock);
}

/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/*
 * Estimate if LWP is cache-hot.
 */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	/* Leave new LWPs in peace, determination has already been made. */
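	/* (sched_takecpu() decided where an LSIDL LWP should start out.) */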
	if (l->l_stat == LSIDL)
		return true;

	if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
		return false;

	return (getticks() - l->l_rticks < mstohz(cacheht_time));
}

/*
 * Check if LWP can migrate to the chosen CPU.
 */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * A small helper to do round robin through CPU packages.
 */
static struct cpu_info *
sched_nextpkg(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	spc->spc_nextpkg =
	    spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];

	return spc->spc_nextpkg;
}

/*
 * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
 * thread.  In case of equal priority, prefer first class CPUs, and amongst
 * the remainder choose the CPU with the fewest runqueue entries.
 *
 * Begin the search in the CPU package which "pivot" is a member of.
 */
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
{
	struct cpu_info *bestci, *curci, *outer;
	struct schedstate_percpu *bestspc, *curspc;
	pri_t bestpri, curpri;

	/*
	 * If this fails (it shouldn't), run on the given CPU.  This also
	 * gives us a weak preference for "pivot" to begin with.
	 */
	bestci = pivot;
	bestspc = &bestci->ci_schedstate;
	if (sched_migratable(l, bestci)) {
		bestpri = MAX(bestspc->spc_curpriority,
		    bestspc->spc_maxpriority);
	} else {
		/* Invalidate the priority. */
		bestpri = PRI_COUNT;
	}

	/* In the outer loop scroll through all CPU packages. */
	pivot = pivot->ci_package1st;
	outer = pivot;
	do {
		/* In the inner loop scroll through all CPUs in package. */
		curci = outer;
		do {
			if (!sched_migratable(l, curci)) {
				continue;
			}

			curspc = &curci->ci_schedstate;

			/* If this CPU is idle and 1st class, we're done. */
			if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
			    (SPCF_IDLE | SPCF_1STCLASS)) {
				return curci;
			}

			curpri = MAX(curspc->spc_curpriority,
			    curspc->spc_maxpriority);

			if (curpri > bestpri) {
				continue;
			}
			if (curpri == bestpri) {
				/* Prefer first class CPUs over others. */
				if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
				    (bestspc->spc_flags & SPCF_1STCLASS) != 0) {
					continue;
				}
				/*
				 * Pick the least busy CPU.  Make sure this is
				 * not <=, otherwise it defeats the above
				 * preference.
				 */
				if (bestspc->spc_count < curspc->spc_count) {
					continue;
				}
			}

			bestpri = curpri;
			bestci = curci;
			bestspc = curspc;

		} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
		    curci != outer);
	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
	    outer != pivot);

	return bestci;
}

/*
 * Estimate whether the LWP should migrate to another CPU.  Pick and
 * return that CPU if migration is needed.
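 *
 * In rough order of preference: keep a strictly bound LWP where it is;
 * scatter brand new (LSIDL) LWPs across packages; reuse an idle first
 * class CPU in the current core; stay put while still cache-hot;
 * otherwise fall back to a full sched_bestcpu() scan.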
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct schedstate_percpu *spc, *tspc;
	struct cpu_info *ci, *curci, *tci;
	pri_t eprio;
	int flags;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	eprio = lwp_eprio(l);

	/*
	 * Handle new LWPs.  For vfork() with a timeshared child, make it
	 * run on the same CPU as the parent if no other LWPs in queue.
	 * Otherwise scatter far and wide - try for an even distribution
	 * across all CPU packages and CPUs.
	 */
	if (l->l_stat == LSIDL) {
		if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
			if (sched_migratable(l, curlwp->l_cpu) && eprio >
			    curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
				return curlwp->l_cpu;
			}
		} else {
			return sched_bestcpu(l, sched_nextpkg());
		}
		flags = SPCF_IDLE;
	} else {
		flags = SPCF_IDLE | SPCF_1STCLASS;
	}

	/*
	 * Try to send the LWP back to the first CPU in the same core if
	 * idle.  This keeps LWPs clustered in the run queues of 1st class
	 * CPUs.  This implies stickiness.  If we didn't find a home for
	 * a vfork() child above, try to use any SMT sibling to help out.
	 */
	tci = ci;
	do {
		tspc = &tci->ci_schedstate;
		if ((tspc->spc_flags & flags) == flags &&
		    sched_migratable(l, tci)) {
			return tci;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != ci);

	/*
	 * Otherwise the LWP is "sticky", i.e. generally preferring to stay
	 * on the same CPU.
	 */
	if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
	    (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
		return ci;
	}

	/*
	 * If the current CPU core is idle, run there and avoid the
	 * expensive scan of CPUs below.
	 */
	curci = curcpu();
	tci = curci;
	do {
		tspc = &tci->ci_schedstate;
		if ((tspc->spc_flags & flags) == flags &&
		    sched_migratable(l, tci)) {
			return tci;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != curci);

	/*
	 * Didn't find a new home above - happens infrequently.  Start the
	 * search in the last CPU package that the LWP ran in, but expand
	 * to include the whole system if needed.
	 */
	return sched_bestcpu(l, l->l_cpu);
}

/*
 * Tries to catch an LWP from the run queue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	struct lwp *l;
	bool gentle;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;

	/*
	 * Be more aggressive if this CPU is first class, and the other
	 * is not.
	 */
	gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
	    (spc->spc_flags & SPCF_1STCLASS) != 0);

	if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
	    curspc->spc_psid != spc->spc_psid) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(spc, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) ||
		    (gentle && lwp_cache_hot(l)) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			/* XXX Gap: could walk down priority list. */
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);
		l->l_cpu = curci;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Called from sched_idle() to handle migration.  Return the CPU that we
 * pushed the LWP to (may be NULL).
 */
static struct cpu_info *
sched_idle_migrate(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	bool dlock = false;

	spc = &ci->ci_schedstate;
	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if destination has changed to the
		 * local CPU, or migration was done by another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues.
		 * We do that only once.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again... */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l);
		sched_resched_lwp(l, true);
		/* tci now unlocked */
		spc_unlock(ci);
		return tci;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);
	return NULL;
}

/*
 * Try to steal an LWP from "tci".
 */
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
	struct schedstate_percpu *spc, *tspc;
	lwp_t *l;

	spc = &ci->ci_schedstate;
	tspc = &tci->ci_schedstate;
	if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
	    spc->spc_psid == tspc->spc_psid) {
		spc_dlock(ci, tci);
		l = sched_catchlwp(tci);
		spc_unlock(ci);
		if (l != NULL) {
			return true;
		}
	}
	return false;
}

/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
	struct schedstate_percpu *spc, *tspc;
	struct lwp *l;

	ci = curcpu();
	spc = &ci->ci_schedstate;
	tci = NULL;
	mci = NULL;

	/*
	 * Handle LWP migrations off this CPU to another.  If there is a
	 * migration to do then remember the CPU the LWP was sent to, and
	 * don't steal the LWP back from that CPU below.
	 */
	if (spc->spc_migrating != NULL) {
		mci = sched_idle_migrate();
	}

	/* If this CPU is offline, or we have an LWP to run, we're done. */
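	/*
	 * (An offline CPU must not steal work, and a non-empty local run
	 * queue means sched_nextlwp() already has something to pick.)
	 */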
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
		return;
	}

	/* Deal with SMT. */
	if (ci->ci_nsibling[CPUREL_CORE] > 1) {
		/* Try to help our siblings out. */
		tci = ci->ci_sibling[CPUREL_CORE];
		while (tci != ci) {
			if (tci != mci && sched_steal(ci, tci)) {
				return;
			}
			tci = tci->ci_sibling[CPUREL_CORE];
		}
		/*
		 * If not the first SMT in the core, and in the default
		 * processor set, the search ends here.
		 */
		if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
		    spc->spc_psid == PS_NONE) {
			return;
		}
	}

	/*
	 * Find something to run, unless this CPU exceeded the rate limit.
	 * Start looking on the current package to maximise L2/L3 cache
	 * locality.  Then expand to looking at the rest of the system.
	 *
	 * XXX Should probably look at 2nd class CPUs first, but they will
	 * shed jobs via preempt() anyway.
	 */
	if (spc->spc_nextskim > getticks()) {
		return;
	}
	spc->spc_nextskim = getticks() + mstohz(skim_interval);

	/* In the outer loop scroll through all CPU packages, starting here. */
	first = ci->ci_package1st;
	outer = first;
	do {
		/* In the inner loop scroll through all CPUs in package. */
		inner = outer;
		do {
			/* Don't hit the locks unless needed. */
			tspc = &inner->ci_schedstate;
			if (ci == inner || ci == mci ||
			    spc->spc_psid != tspc->spc_psid ||
			    atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
				continue;
			}
			spc_dlock(ci, inner);
			l = sched_catchlwp(inner);
			spc_unlock(ci);
			if (l != NULL) {
				/* Got it! */
				return;
			}
		} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
		    inner != outer);
	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
	    outer != first);
}

/*
 * Called from mi_switch() when an LWP has been preempted / has yielded.
 * The LWP is presently in the CPU's run queue.  Here we look for a better
 * CPU to teleport the LWP to; there may not be one.
 */
void
sched_preempted(struct lwp *l)
{
	const int flags = SPCF_IDLE | SPCF_1STCLASS;
	struct schedstate_percpu *tspc;
	struct cpu_info *ci, *tci;

	ci = l->l_cpu;
	tspc = &ci->ci_schedstate;

	KASSERT(tspc->spc_count >= 1);

	/*
	 * Try to select another CPU if:
	 *
	 * - there is no migration pending already
	 * - and this LWP is running on a 2nd class CPU
	 * - or this LWP is a child of vfork() that has just done execve()
	 */
	if (l->l_target_cpu != NULL ||
	    ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
	    (l->l_pflag & LP_TELEPORT) == 0)) {
		return;
	}

	/*
	 * Fast path: if the first SMT in the core is idle, send it back
	 * there, because the cache is shared (cheap) and we want all LWPs
	 * to be clustered on 1st class CPUs (either running there or on
	 * their runqueues).
	 */
	tci = ci->ci_sibling[CPUREL_CORE];
	while (tci != ci) {
		tspc = &tci->ci_schedstate;
		if ((tspc->spc_flags & flags) == flags &&
		    sched_migratable(l, tci)) {
			l->l_target_cpu = tci;
			l->l_pflag &= ~LP_TELEPORT;
			return;
		}
		tci = tci->ci_sibling[CPUREL_CORE];
	}

	if ((l->l_pflag & LP_TELEPORT) != 0) {
		/*
		 * A child of vfork(): now that the parent is released,
		 * scatter far and wide, to match the LSIDL distribution
		 * done in sched_takecpu().
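		 * (LP_TELEPORT was set for this purpose by
		 * sched_vforkexec().)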
		 */
		l->l_pflag &= ~LP_TELEPORT;
		tci = sched_bestcpu(l, sched_nextpkg());
		if (tci != ci) {
			l->l_target_cpu = tci;
		}
	} else {
		/*
		 * Try to find a better CPU to take it, but don't move to
		 * another 2nd class CPU, and don't move to a non-idle CPU,
		 * because that would prevent SMT being used to maximise
		 * throughput.
		 *
		 * Search in the current CPU package in order to try and
		 * keep L2/L3 cache locality, but expand to include the
		 * whole system if needed.
		 */
		tci = sched_bestcpu(l, l->l_cpu);
		if (tci != ci &&
		    (tci->ci_schedstate.spc_flags & flags) == flags) {
			l->l_target_cpu = tci;
		}
	}
}

/*
 * Called during execve() by a child of vfork().  Does two things:
 *
 * - If the parent has been awoken and put back on curcpu then give the
 *   CPU back to the parent.
 *
 * - If curlwp is not on a 1st class CPU then find somewhere else to run,
 *   since it dodged the distribution in sched_takecpu() when first set
 *   runnable.
 */
void
sched_vforkexec(struct lwp *l, bool samecpu)
{

	KASSERT(l == curlwp);
	if ((samecpu && ncpu > 1) ||
	    (l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
		l->l_pflag |= LP_TELEPORT;
		preempt();
	}
}

#else

/*
 * stubs for !MULTIPROCESSOR
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}

void
sched_preempted(struct lwp *l)
{

}

void
sched_vforkexec(struct lwp *l, bool samecpu)
{

	KASSERT(l == curlwp);
}

#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound (LW_BATCH) if its run time exceeds
	 * its sleep time.  Only report it as batch to the hook below once
	 * it has also been seen as CPU-bound on the previous pass.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}

/*
 * Scheduler mill.
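 *
 * sched_nextlwp() hands the dispatcher the highest-priority LWP from
 * this CPU's run queue, or NULL when the idle LWP should run instead
 * (empty queue, or a pending migration for the idle loop to finish).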
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	struct lwp *l;

	/* Update the last run time on switch */
	l = curlwp;
	l->l_rticksum += (getticks() - l->l_rticks);

	/* Return to idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;

	/* Return to idle LWP if there is no runnable job */
	if (__predict_false(spc->spc_count == 0))
		return NULL;

	/* Take the highest priority thread */
	KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(spc, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = getticks();

	return l;
}

/*
 * sched_curcpu_runnable_p: return true if curcpu() should exit the idle
 * loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	rv = (spc->spc_count != 0);
#ifndef __HAVE_FAST_SOFTINTS
	rv |= (ci->ci_data.cpu_softints != 0);
#endif
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ms)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "skim_interval",
		SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
		NULL, 0, &skim_interval, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
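 *
 * sched_print_runqueue() dumps each CPU's run queue counters and bitmap,
 * then the scheduling state of every LWP in the system.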
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    spc->spc_count, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = spc->spc_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(getticks() - l->l_rticks));
		}
	}
}

#endif