/*
 * Copyright (c) 2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>

#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>

#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/mach_param.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>

#include <sys/kdebug.h>

/*
 * Theory Statement
 *
 * How does the task scheduler work?
 *
 * It schedules threads across a few levels.
 *
 * RT threads are dealt with above us.
 * Bound threads go into the per-processor runq.
 * Non-bound threads are linked on their task's sched_group's runq.
 * sched_groups' sched_entries are linked on the pset's runq.
 *
 * TODO: make this explicit - bound threads should have a different enqueue function
 *
 * When we choose a new thread, we decide whether to look at the bound runqueue, the global runqueue,
 * or the current group's runqueue, then dequeue the next thread in that runqueue.
 *
 * We then manipulate the sched_entries to maintain the invariant that:
 * each non-empty priority level in a group's runq is represented by one sched_entry enqueued in the global
 * runqueue.
 *
 * A sched_entry represents a chance at running - for each priority in each task, there is one chance of getting
 * to run.  This reduces the excess contention bonus given to processes which have work spread among many threads
 * as compared to processes which do the same amount of work under fewer threads.
 *
 * NOTE: Currently, the multiq scheduler only supports one pset.
 *
 * NOTE ABOUT thread->sched_pri:
 *
 * It can change after enqueue - it's changed without the pset lock but with the thread lock if thread->runq is 0.
 * Therefore we can only depend on it not changing during the enqueue and remove path, not the dequeue.
 *
 * TODO: Future features:
 *
 * Decouple the task priority from the sched_entry priority, allowing for:
 *      fast task priority change without having to iterate and re-dispatch all threads in the task,
 *        i.e. task-wide priority, task-wide boosting
 *      fancier group decay features
 *
 * Group (or task) decay:
 *      Decay is used for a few different things:
 *        Prioritizing latency-needing threads over throughput-needing threads for time-to-running
 *        Balancing work between threads in a process
 *        Balancing work done at the same priority between different processes
 *        Recovering from priority inversions between two threads in the same process
 *        Recovering from priority inversions between two threads in different processes
 *        Simulating a proportional share scheduler by allowing lower priority threads
 *          to run for a certain percentage of the time
 *
 *      Task decay lets us separately address the 'same process' and 'different process' needs,
 *      which will allow us to make smarter tradeoffs in different cases.
 *      For example, we could resolve priority inversion in the same process by reordering threads without dropping
 *      the process below low priority threads in other processes.
 *
 * One lock to rule them all (or at least all the runqueues) instead of the pset locks.
 *
 * Shrink sched_entry size to the size of a queue_chain_t by inferring priority, group, and perhaps the runq field.
 *      The entries array is 5K currently so it'd be really great to reduce.
 *      One way to get sched_group below 4K without a new runq structure would be to remove the extra queues above realtime.
 *
 * When preempting a processor, store a flag saying if the preemption
 * was from a thread in the same group or a different group,
 * and tell choose_thread about it.
 *
 * When choosing a processor, bias towards those running in the same
 * group as I am running (at the same priority, or within a certain band?).
 *
 * Decide if we need to support psets.
 * Decide how to support psets - do we need duplicate entries for each pset,
 * or can we get away with putting the entry in either one or the other pset?
 *
 * Consider the right way to handle runq count - I don't want to iterate groups.
 *      Perhaps keep a global counter.  sched_run_count will not work.
 *      Alternate option - remove it from choose_processor.  It doesn't add much value
 *      now that we have a global runq.
 *
 * Need a better way of finding the group to target instead of looking at current_task.
 * Perhaps choose_thread could pass in the current thread?
 *
 * Consider unifying runq copy-pastes.
 *
 * Thoughts on having a group central quantum bucket:
 *
 * I see two algorithms to decide quanta:
 * A) Hand off only when switching thread to thread in the same group
 * B) Allocate and return quanta to the group's pool
 *
 * Issues:
 * If a task blocks completely, should it come back with the leftover quanta
 * or brand new quanta?
 *
 * Should we set a flag saying 'zero out any quantum you grab when you're dispatched'?
 *
 * Resolution:
 * Handing off quanta between threads will help with jumping around in the current task
 * but will not help when a thread from a different task is involved.
 * Need an algorithm that works with round-robining between threads in different tasks.
 *
 * But wait - round-robining can only be triggered by quantum expire or blocking.
 * We need something that works with preemption or yielding - that's the more interesting idea.
 *
 * Existing algorithm - preemption doesn't re-set the quantum, and puts the thread on the head of the runq.
 * Blocking or quantum expiration does re-set the quantum, and puts the thread on the tail of the runq.
 *
 * New algorithm -
 * Hand off quanta when hopping between threads with the same sched_group.
 * Even if the thread was blocked, it uses the last thread's remaining quantum when it starts.
 *
 * If we use the 'only cycle the entry at quantum expiration' algorithm, then the quantum pool starts getting
 * interesting.
 *
 * A thought - perhaps the handoff approach doesn't work so well in the presence of
 * non-handoff wakeups, i.e. wake another thread, then wait, then block - it doesn't mean that the
 * woken thread will be what I switch to - another processor may have stolen it.
 * What do we do there?
 *
 * Conclusions:
 * We currently don't know of a scenario where quantum buckets on the task are beneficial.
 * We will instead hand off the quantum between threads in the task, and keep the quantum
 * on the preempted thread if it's preempted by something outside the task.
 *
 */

#if DEBUG || DEVELOPMENT
#define MULTIQ_SANITY_CHECK
#endif

typedef struct sched_entry {
    queue_chain_t links;
    int16_t       sched_pri;    /* scheduled (current) priority */
    int16_t       runq;
    int32_t       pad;
} *sched_entry_t;

typedef run_queue_t entry_queue_t;  /* A run queue that holds sched_entries instead of threads */
typedef run_queue_t group_runq_t;   /* A run queue that is part of a sched_group */

#define SCHED_ENTRY_NULL    ((sched_entry_t) 0)
#define MULTIQ_ERUNQ        (-4)    /* Indicates entry is on the main runq */

/* Each level in the run queue corresponds to one entry in the entries array */
struct sched_group {
    struct sched_entry entries[NRQS];
    struct run_queue   runq;
    queue_chain_t      sched_groups;
};

/* TODO: Turn this into an attribute in the sched dispatch struct */
boolean_t sched_groups_enabled = FALSE;
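
/*
 * Illustration of the invariant described in the theory statement above
 * (numbers chosen for example only): if a group has three runnable threads,
 * two at priority 47 and one at priority 31, its runq holds all three
 * threads, while exactly two of its sched_entries - entries[47] and
 * entries[31] - are linked into the pset's entry runqueue.  Dequeueing the
 * last thread at priority 47 removes entries[47] from the pset runqueue;
 * enqueueing a thread at a currently-empty priority links that priority's
 * entry back in.  The entry count in the pset runqueue therefore tracks
 * non-empty (group, priority) pairs, not threads.
 */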

/*
 * Keep entry on the head of the runqueue while dequeueing threads.
 * Only cycle it to the end of the runqueue when a thread in the task
 * hits its quantum.
 */
static boolean_t deep_drain = FALSE;

/* Don't favor the task when an urgent thread is present. */
static boolean_t drain_urgent_first = TRUE;

/* Verify the consistency of the runq before touching it */
static boolean_t multiq_sanity_check = FALSE;

/*
 * Draining threads from the current task is preferred
 * when they're less than X steps below the current
 * global highest priority.
 */
#define DEFAULT_DRAIN_BAND_LIMIT MAXPRI
static integer_t drain_band_limit;

/*
 * Don't go below this priority level if there is something above it in another task.
 */
#define DEFAULT_DRAIN_DEPTH_LIMIT MAXPRI_THROTTLE
static integer_t drain_depth_limit;
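
/*
 * Example of how the two drain limits interact (values are illustrative;
 * the defaults above are MAXPRI_THROTTLE for the depth limit and MAXPRI for
 * the band limit): suppose the global entry runqueue's highest priority is
 * 47 and the current group's highest runnable thread is at priority 4.
 * With the default depth limit, the group thread sits at or below
 * MAXPRI_THROTTLE while higher-priority work exists elsewhere, so the drain
 * limit is hit and choose_thread falls back to the global runqueue.  With a
 * band limit of, say, 16, the same would happen whenever the group's best
 * thread is 16 or more steps below the global highest priority.  The
 * default band limit of MAXPRI effectively disables the band check.
 */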

static struct zone *sched_group_zone;

static uint64_t     num_sched_groups = 0;
static queue_head_t sched_groups;

static lck_attr_t     sched_groups_lock_attr;
static lck_grp_t      sched_groups_lock_grp;
static lck_grp_attr_t sched_groups_lock_grp_attr;

static lck_mtx_t sched_groups_lock;


static void
sched_multiq_init(void);

static thread_t
sched_multiq_steal_thread(processor_set_t pset);

static void
sched_multiq_thread_update_scan(void);

static boolean_t
sched_multiq_processor_enqueue(processor_t processor, thread_t thread, integer_t options);

static boolean_t
sched_multiq_processor_queue_remove(processor_t processor, thread_t thread);

void
sched_multiq_quantum_expire(thread_t thread);

static ast_t
sched_multiq_processor_csw_check(processor_t processor);

static boolean_t
sched_multiq_processor_queue_has_priority(processor_t processor, int priority, boolean_t gte);

static int
sched_multiq_runq_count(processor_t processor);

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor);

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor);

static int
sched_multiq_processor_bound_count(processor_t processor);

static void
sched_multiq_pset_init(processor_set_t pset);

static void
sched_multiq_processor_init(processor_t processor);

static thread_t
sched_multiq_choose_thread(processor_t processor, int priority, ast_t reason);

static void
sched_multiq_processor_queue_shutdown(processor_t processor);

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task);

static boolean_t
sched_multiq_should_current_thread_rechoose_processor(processor_t processor);

const struct sched_dispatch_table sched_multiq_dispatch = {
    .init                                     = sched_multiq_init,
    .timebase_init                            = sched_traditional_timebase_init,
    .processor_init                           = sched_multiq_processor_init,
    .pset_init                                = sched_multiq_pset_init,
    .maintenance_continuation                 = sched_traditional_maintenance_continue,
    .choose_thread                            = sched_multiq_choose_thread,
    .steal_thread                             = sched_multiq_steal_thread,
    .compute_priority                         = compute_priority,
    .choose_processor                         = choose_processor,
    .processor_enqueue                        = sched_multiq_processor_enqueue,
    .processor_queue_shutdown                 = sched_multiq_processor_queue_shutdown,
    .processor_queue_remove                   = sched_multiq_processor_queue_remove,
    .processor_queue_empty                    = sched_multiq_processor_queue_empty,
    .priority_is_urgent                       = priority_is_urgent,
    .processor_csw_check                      = sched_multiq_processor_csw_check,
    .processor_queue_has_priority             = sched_multiq_processor_queue_has_priority,
    .initial_quantum_size                     = sched_traditional_initial_quantum_size,
    .initial_thread_sched_mode                = sched_multiq_initial_thread_sched_mode,
    .can_update_priority                      = can_update_priority,
    .update_priority                          = update_priority,
    .lightweight_update_priority              = lightweight_update_priority,
    .quantum_expire                           = sched_multiq_quantum_expire,
    .should_current_thread_rechoose_processor = sched_multiq_should_current_thread_rechoose_processor,
    .processor_runq_count                     = sched_multiq_runq_count,
    .processor_runq_stats_count_sum           = sched_multiq_runq_stats_count_sum,
    .fairshare_init                           = sched_traditional_fairshare_init,
    .fairshare_runq_count                     = sched_traditional_fairshare_runq_count,
    .fairshare_runq_stats_count_sum           = sched_traditional_fairshare_runq_stats_count_sum,
    .fairshare_enqueue                        = sched_traditional_fairshare_enqueue,
    .fairshare_dequeue                        = sched_traditional_fairshare_dequeue,
    .fairshare_queue_remove                   = sched_traditional_fairshare_queue_remove,
    .processor_bound_count                    = sched_multiq_processor_bound_count,
    .thread_update_scan                       = sched_multiq_thread_update_scan,
    .direct_dispatch_to_idle_processors       = FALSE,
};


static void
sched_multiq_init(void)
{
    sched_groups_enabled = TRUE;

#if defined(MULTIQ_SANITY_CHECK)
    PE_parse_boot_argn("-multiq-sanity-check", &multiq_sanity_check, sizeof(multiq_sanity_check));
#endif

    PE_parse_boot_argn("-multiq-deep-drain", &deep_drain, sizeof(deep_drain));

    PE_parse_boot_argn("multiq_drain_urgent_first", &drain_urgent_first, sizeof(drain_urgent_first));

    if (!PE_parse_boot_argn("multiq_drain_depth_limit", &drain_depth_limit, sizeof(drain_depth_limit))) {
        drain_depth_limit = DEFAULT_DRAIN_DEPTH_LIMIT;
    }

    if (!PE_parse_boot_argn("multiq_drain_band_limit", &drain_band_limit, sizeof(drain_band_limit))) {
        drain_band_limit = DEFAULT_DRAIN_BAND_LIMIT;
    }

    printf("multiq scheduler config: deep-drain %d, urgent first %d, depth limit %d, band limit %d, sanity check %d\n",
           deep_drain, drain_urgent_first, drain_depth_limit, drain_band_limit, multiq_sanity_check);

    sched_group_zone = zinit(
        sizeof(struct sched_group),
        task_max * sizeof(struct sched_group),
        PAGE_SIZE,
        "sched groups");

    zone_change(sched_group_zone, Z_NOENCRYPT, TRUE);
    zone_change(sched_group_zone, Z_NOCALLOUT, TRUE);

    queue_init(&sched_groups);

    lck_grp_attr_setdefault(&sched_groups_lock_grp_attr);
    lck_grp_init(&sched_groups_lock_grp, "sched_groups", &sched_groups_lock_grp_attr);
    lck_attr_setdefault(&sched_groups_lock_attr);
    lck_mtx_init(&sched_groups_lock, &sched_groups_lock_grp, &sched_groups_lock_attr);

    sched_traditional_init();
}
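
/*
 * The drain tunables above are only read at boot, via the boot-args parsed
 * in sched_multiq_init().  For example (illustrative values), booting with
 *
 *      multiq_drain_depth_limit=4 multiq_drain_band_limit=16 -multiq-deep-drain
 *
 * would set the depth limit to 4, tighten the band limit to 16 priority
 * steps, and enable the deep-drain behavior.  The resulting configuration
 * is reported by the printf in sched_multiq_init().
 */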

static void
sched_multiq_processor_init(processor_t processor)
{
    run_queue_init(&processor->runq);
}

static void
sched_multiq_pset_init(processor_set_t pset)
{
    run_queue_init(&pset->pset_runq);
}

static sched_mode_t
sched_multiq_initial_thread_sched_mode(task_t parent_task)
{
    if (parent_task == kernel_task)
        return TH_MODE_FIXED;
    else
        return TH_MODE_TIMESHARE;
}

sched_group_t
sched_group_create(void)
{
    sched_group_t sched_group;

    if (!sched_groups_enabled)
        return SCHED_GROUP_NULL;

    sched_group = (sched_group_t)zalloc(sched_group_zone);

    bzero(sched_group, sizeof(struct sched_group));

    run_queue_init(&sched_group->runq);

    for (int i = 0; i < NRQS; i++) {
        sched_group->entries[i].runq = 0;
        sched_group->entries[i].sched_pri = i;
    }

    lck_mtx_lock(&sched_groups_lock);
    queue_enter(&sched_groups, sched_group, sched_group_t, sched_groups);
    num_sched_groups++;
    lck_mtx_unlock(&sched_groups_lock);

    return (sched_group);
}

void
sched_group_destroy(sched_group_t sched_group)
{
    if (!sched_groups_enabled) {
        assert(sched_group == SCHED_GROUP_NULL);
        return;
    }

    assert(sched_group != SCHED_GROUP_NULL);
    assert(sched_group->runq.count == 0);

    for (int i = 0; i < NRQS; i++) {
        assert(sched_group->entries[i].runq == 0);
        assert(sched_group->entries[i].sched_pri == i);
    }

    lck_mtx_lock(&sched_groups_lock);
    queue_remove(&sched_groups, sched_group, sched_group_t, sched_groups);
    num_sched_groups--;
    lck_mtx_unlock(&sched_groups_lock);

    zfree(sched_group_zone, sched_group);
}

__attribute__((always_inline))
static inline entry_queue_t
multiq_main_entryq(processor_t processor)
{
    return (entry_queue_t)&processor->processor_set->pset_runq;
}

__attribute__((always_inline))
static inline run_queue_t
multiq_bound_runq(processor_t processor)
{
    return &processor->runq;
}

__attribute__((always_inline))
static inline sched_entry_t
group_entry_for_pri(sched_group_t group, integer_t pri)
{
    return &group->entries[pri];
}

/*
 * The entries array is the first member of struct sched_group, and
 * entries[i].sched_pri == i, so stepping back sched_pri elements from an
 * entry pointer recovers the base of the array - and therefore the group.
 */
__attribute__((always_inline))
static inline sched_group_t
group_for_entry(sched_entry_t entry)
{
    sched_group_t group = (sched_group_t)(entry - entry->sched_pri);
    return group;
}

/* Peek at the head of the runqueue */
static sched_entry_t
entry_queue_first_entry(entry_queue_t rq)
{
    assert(rq->count != 0);

    queue_t queue = rq->queues + rq->highq;

    sched_entry_t entry = (sched_entry_t)queue_first(queue);

    assert(entry->sched_pri == rq->highq);

    return entry;
}

#if defined(MULTIQ_SANITY_CHECK)

__attribute__((always_inline))
static inline boolean_t
queue_chain_linked(queue_chain_t* chain)
{
    if (chain->next != NULL) {
        assert(chain->prev != NULL);
        return TRUE;
    } else {
        assert(chain->prev == NULL);
        return FALSE;
    }
}

static thread_t
group_first_thread(sched_group_t group)
{
    group_runq_t rq = &group->runq;

    assert(rq->count != 0);

    queue_t queue = rq->queues + rq->highq;

    thread_t thread = (thread_t)(void*)queue_first(queue);

    assert(thread != THREAD_NULL);

    assert(thread->sched_group == group);

    /* TODO: May not be safe */
    assert(thread->sched_pri == rq->highq);

    return thread;
}

/* Asserts if entry is not in entry runq at pri */
static void
entry_queue_check_entry(entry_queue_t runq, sched_entry_t entry, int expected_pri)
{
    queue_t q;
    sched_entry_t elem;

    assert(queue_chain_linked(&entry->links));
    assert(entry->runq == MULTIQ_ERUNQ);

    q = &runq->queues[expected_pri];

    queue_iterate(q, elem, sched_entry_t, links) {
        if (elem == entry)
            return;
    }

    panic("runq %p doesn't contain entry %p at pri %d", runq, entry, expected_pri);
}

/* Asserts if thread is not in group at its priority */
static void
sched_group_check_thread(sched_group_t group, thread_t thread)
{
    queue_t q;
    thread_t elem;
    int pri = thread->sched_pri;

    assert(thread->runq != PROCESSOR_NULL);

    q = &group->runq.queues[pri];

    queue_iterate(q, elem, thread_t, links) {
        if (elem == thread)
            return;
    }

    panic("group %p doesn't contain thread %p at pri %d", group, thread, pri);
}

static void
global_check_entry_queue(entry_queue_t main_entryq)
{
    if (main_entryq->count == 0)
        return;

    sched_entry_t entry = entry_queue_first_entry(main_entryq);

    assert(entry->runq == MULTIQ_ERUNQ);

    sched_group_t group = group_for_entry(entry);

    thread_t thread = group_first_thread(group);

    __assert_only sched_entry_t thread_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

    assert(entry->sched_pri == group->runq.highq);

    assert(entry == thread_entry);
    assert(thread->runq != PROCESSOR_NULL);
}

static void
group_check_run_queue(entry_queue_t main_entryq, sched_group_t group)
{
    if (group->runq.count == 0)
        return;

    thread_t thread = group_first_thread(group);

    assert(thread->runq != PROCESSOR_NULL);

    sched_entry_t sched_entry = group_entry_for_pri(thread->sched_group, thread->sched_pri);

    entry_queue_check_entry(main_entryq, sched_entry, thread->sched_pri);

    assert(sched_entry->sched_pri == thread->sched_pri);
    assert(sched_entry->runq == MULTIQ_ERUNQ);
}

#endif /* defined(MULTIQ_SANITY_CHECK) */

/*
 * The run queue must not be empty.
 */
static sched_entry_t
entry_queue_dequeue_entry(entry_queue_t rq)
{
    sched_entry_t sched_entry;
    queue_t queue = rq->queues + rq->highq;

    assert(rq->count > 0);
    assert(!queue_empty(queue));

    sched_entry = (sched_entry_t)dequeue_head(queue);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(rq->highq)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }
    if (queue_empty(queue)) {
        if (rq->highq != IDLEPRI)
            clrbit(MAXPRI - rq->highq, rq->bitmap);
        rq->highq = MAXPRI - ffsbit(rq->bitmap);
    }

    sched_entry->runq = 0;

    return (sched_entry);
}

/*
 * The entry must not be on any run queue.
 * Returns TRUE if the entry is now the highest priority in the queue.
 */
static boolean_t
entry_queue_enqueue_entry(
    entry_queue_t rq,
    sched_entry_t entry,
    integer_t     options)
{
    int sched_pri = entry->sched_pri;
    queue_t queue = rq->queues + sched_pri;
    boolean_t result = FALSE;

    assert(entry->runq == 0);

    if (queue_empty(queue)) {
        enqueue_tail(queue, (queue_entry_t)entry);

        setbit(MAXPRI - sched_pri, rq->bitmap);
        if (sched_pri > rq->highq) {
            rq->highq = sched_pri;
            result = TRUE;
        }
    } else {
        if (options & SCHED_TAILQ)
            enqueue_tail(queue, (queue_entry_t)entry);
        else
            enqueue_head(queue, (queue_entry_t)entry);
    }
    if (SCHED(priority_is_urgent)(sched_pri))
        rq->urgency++;
    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count++;

    entry->runq = MULTIQ_ERUNQ;

    return (result);
}
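
/*
 * Worked example of the bitmap bookkeeping used by the enqueue/dequeue
 * routines above (and by the thread variants below), assuming MAXPRI is
 * 127: priorities are recorded in rq->bitmap at bit index (MAXPRI - pri),
 * so a runqueue with work at priorities 80 and 31 has bits 47 and 96 set.
 * ffsbit() returns the lowest set bit index - here 47 - and
 * MAXPRI - ffsbit(rq->bitmap) recovers the highest non-empty priority, 80,
 * which is cached in rq->highq.  Clearing bit 47 once priority 80 empties
 * makes the same computation yield 31.
 */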

/*
 * The entry must be in this runqueue.
 */
static void
entry_queue_remove_entry(
    entry_queue_t rq,
    sched_entry_t entry)
{
    int sched_pri = entry->sched_pri;

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        entry_queue_check_entry(rq, entry, sched_pri);
    }
#endif

    remqueue((queue_entry_t)entry);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(sched_pri)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }

    if (queue_empty(rq->queues + sched_pri)) {
        /* update run queue status */
        if (sched_pri != IDLEPRI)
            clrbit(MAXPRI - sched_pri, rq->bitmap);
        rq->highq = MAXPRI - ffsbit(rq->bitmap);
    }

    entry->runq = 0;
}

/*
 * The run queue must not be empty.
 *
 * Sets queue_empty to TRUE if the queue is now empty at thread_pri.
 */
static thread_t
group_run_queue_dequeue_thread(
    group_runq_t rq,
    integer_t   *thread_pri,
    boolean_t   *queue_empty)
{
    thread_t thread;
    queue_t queue = rq->queues + rq->highq;

    assert(rq->count > 0);
    assert(!queue_empty(queue));

    *thread_pri = rq->highq;

    thread = (thread_t)(void*)dequeue_head(queue);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(rq->highq)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }
    if (queue_empty(queue)) {
        if (rq->highq != IDLEPRI)
            clrbit(MAXPRI - rq->highq, rq->bitmap);
        rq->highq = MAXPRI - ffsbit(rq->bitmap);
        *queue_empty = TRUE;
    } else {
        *queue_empty = FALSE;
    }

    return (thread);
}

/*
 * The thread must not be on any run queue.
 * Returns TRUE if the queue was empty at thread_pri before the enqueue.
 */
static boolean_t
group_run_queue_enqueue_thread(
    group_runq_t rq,
    thread_t     thread,
    integer_t    thread_pri,
    integer_t    options)
{
    queue_t queue = rq->queues + thread_pri;
    boolean_t result = FALSE;

    assert(thread->runq == PROCESSOR_NULL);

    if (queue_empty(queue)) {
        enqueue_tail(queue, (queue_entry_t)thread);

        setbit(MAXPRI - thread_pri, rq->bitmap);
        if (thread_pri > rq->highq) {
            rq->highq = thread_pri;
        }
        result = TRUE;
    } else {
        if (options & SCHED_TAILQ)
            enqueue_tail(queue, (queue_entry_t)thread);
        else
            enqueue_head(queue, (queue_entry_t)thread);
    }
    if (SCHED(priority_is_urgent)(thread_pri))
        rq->urgency++;
    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count++;

    return (result);
}

/*
 * The thread must be in this runqueue.
 * Returns TRUE if the queue is now empty at thread_pri.
 */
static boolean_t
group_run_queue_remove_thread(
    group_runq_t rq,
    thread_t     thread,
    integer_t    thread_pri)
{
    boolean_t result = FALSE;

    assert(thread->runq != PROCESSOR_NULL);

    remqueue((queue_entry_t)thread);

    SCHED_STATS_RUNQ_CHANGE(&rq->runq_stats, rq->count);
    rq->count--;
    if (SCHED(priority_is_urgent)(thread_pri)) {
        rq->urgency--; assert(rq->urgency >= 0);
    }

    if (queue_empty(rq->queues + thread_pri)) {
        /* update run queue status */
        if (thread_pri != IDLEPRI)
            clrbit(MAXPRI - thread_pri, rq->bitmap);
        rq->highq = MAXPRI - ffsbit(rq->bitmap);
        result = TRUE;
    }

    thread->runq = PROCESSOR_NULL;

    return result;
}

/*
 * A thread's sched pri may change out from under us because
 * we're clearing thread->runq here without the thread locked.
 * Do not rely on it to be the same as when we enqueued.
 */
static thread_t
sched_global_dequeue_thread(entry_queue_t main_entryq)
{
    boolean_t pri_level_empty = FALSE;
    sched_entry_t entry;
    group_runq_t group_runq;
    thread_t thread;
    integer_t thread_pri;
    sched_group_t group;

    assert(main_entryq->count > 0);

    entry = entry_queue_dequeue_entry(main_entryq);

    group = group_for_entry(entry);
    group_runq = &group->runq;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (!pri_level_empty) {
        entry_queue_enqueue_entry(main_entryq, entry, SCHED_TAILQ);
    }

    return thread;
}

/* Dequeue a thread from the global runq without moving the entry */
static thread_t
sched_global_deep_drain_dequeue_thread(entry_queue_t main_entryq)
{
    boolean_t pri_level_empty = FALSE;
    sched_entry_t entry;
    group_runq_t group_runq;
    thread_t thread;
    integer_t thread_pri;
    sched_group_t group;

    assert(main_entryq->count > 0);

    entry = entry_queue_first_entry(main_entryq);

    group = group_for_entry(entry);
    group_runq = &group->runq;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, entry);
    }

    return thread;
}


static thread_t
sched_group_dequeue_thread(
    entry_queue_t main_entryq,
    sched_group_t group)
{
    group_runq_t group_runq = &group->runq;
    boolean_t pri_level_empty = FALSE;
    thread_t thread;
    integer_t thread_pri;

    thread = group_run_queue_dequeue_thread(group_runq, &thread_pri, &pri_level_empty);

    thread->runq = PROCESSOR_NULL;

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, group_entry_for_pri(group, thread_pri));
    }

    return thread;
}

static void
sched_group_remove_thread(
    entry_queue_t main_entryq,
    sched_group_t group,
    thread_t      thread)
{
    integer_t thread_pri = thread->sched_pri;
    sched_entry_t sched_entry = group_entry_for_pri(group, thread_pri);

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);

        sched_group_check_thread(group, thread);
        entry_queue_check_entry(main_entryq, sched_entry, thread_pri);
    }
#endif

    boolean_t pri_level_empty = group_run_queue_remove_thread(&group->runq, thread, thread_pri);

    if (pri_level_empty) {
        entry_queue_remove_entry(main_entryq, sched_entry);
    }

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif
}

static void
sched_group_enqueue_thread(
    entry_queue_t main_entryq,
    sched_group_t group,
    thread_t      thread,
    integer_t     options)
{
#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif

    int sched_pri = thread->sched_pri;

    boolean_t pri_level_was_empty = group_run_queue_enqueue_thread(&group->runq, thread, sched_pri, options);

    if (pri_level_was_empty) {
        /*
         * TODO: Need to figure out if passing options here is a good idea or not.
         * What effects would it have?
         */
        entry_queue_enqueue_entry(main_entryq, &group->entries[sched_pri], options);
    }
}

/*
 * Locate a thread to execute from the run queue and return it.
 * Only choose a thread with greater or equal priority.
 *
 * pset is locked, thread is not locked.
 *
 * Returns THREAD_NULL if it cannot find a valid thread.
 *
 * Note: we cannot rely on the value of thread->sched_pri in this path because
 * we don't have the thread locked.
 *
 * TODO: Remove tracepoints
 */
static thread_t
sched_multiq_choose_thread(
    processor_t processor,
    int         priority,
    ast_t       reason)
{
    entry_queue_t main_entryq = multiq_main_entryq(processor);
    run_queue_t   bound_runq  = multiq_bound_runq(processor);

    boolean_t choose_bound_runq = FALSE;

    if (bound_runq->highq < priority &&
        main_entryq->highq < priority)
        return THREAD_NULL;

    if (bound_runq->count && main_entryq->count) {
        if (bound_runq->highq >= main_entryq->highq) {
            choose_bound_runq = TRUE;
        } else {
            /* Use main runq */
        }
    } else if (bound_runq->count) {
        choose_bound_runq = TRUE;
    } else if (main_entryq->count) {
        /* Use main runq */
    } else {
        return (THREAD_NULL);
    }

    if (choose_bound_runq) {
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
            MACH_MULTIQ_BOUND, main_entryq->highq, bound_runq->highq, 0, 0);

        return run_queue_dequeue(bound_runq, SCHED_HEADQ);
    }

    sched_group_t group = current_thread()->sched_group;

#if defined(MULTIQ_SANITY_CHECK)
    if (multiq_sanity_check) {
        global_check_entry_queue(main_entryq);
        group_check_run_queue(main_entryq, group);
    }
#endif

    /*
     * Determine if we should look at the group or the global queue.
     *
     * TODO:
     * Perhaps pass reason as a 'should look inside' argument to choose_thread.
     * Should a YIELD AST override the drain limit?
     */
    if (group->runq.count != 0 && (reason & AST_PREEMPTION) == 0) {
        boolean_t drain_limit_hit = FALSE;

        if (main_entryq->highq > group->runq.highq) {
            /*
             * If there's something elsewhere above the depth limit,
             * don't pick a thread below the limit.
             */
            if (main_entryq->highq > drain_depth_limit &&
                group->runq.highq <= drain_depth_limit)
                drain_limit_hit = TRUE;

            /*
             * Don't go more than X steps below the global highest.
             */
            if ((main_entryq->highq - group->runq.highq) >= drain_band_limit)
                drain_limit_hit = TRUE;

            /* Don't favor the task when an urgent thread is present. */
            if (drain_urgent_first && main_entryq->urgency > 0)
                drain_limit_hit = TRUE;
        }

        if (!drain_limit_hit) {
            /* Pull from local runq */
            KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
                MACH_MULTIQ_GROUP, main_entryq->highq, group->runq.highq, 0, 0);

            return sched_group_dequeue_thread(main_entryq, group);
        }
    }

    KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
        MACHDBG_CODE(DBG_MACH_SCHED, MACH_MULTIQ_DEQUEUE) | DBG_FUNC_NONE,
        MACH_MULTIQ_GLOBAL, main_entryq->highq, group->runq.highq, 0, 0);

    /* Couldn't pull from local runq, pull from global runq instead */
    if (deep_drain) {
        return sched_global_deep_drain_dequeue_thread(main_entryq);
    } else {
        return sched_global_dequeue_thread(main_entryq);
    }
}
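
/*
 * Sketch of the decision above with illustrative numbers: on a processor
 * whose bound runq is empty, with the global entry runqueue's highest
 * priority at 47 and the current thread's group holding a runnable thread
 * at priority 40, a non-preemption choose_thread() call prefers the group
 * thread as long as 40 is above drain_depth_limit, within drain_band_limit
 * steps of 47, and no urgent thread is pending (when drain_urgent_first is
 * set).  If any of those checks fail, or if the call is servicing a
 * preemption AST, the thread comes from the global entry runqueue instead -
 * via sched_global_deep_drain_dequeue_thread() when deep_drain is enabled,
 * which leaves the entry at the head of its priority level, or via
 * sched_global_dequeue_thread(), which cycles it to the tail.
 */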

/*
 * Thread must be locked, and not already be on a run queue.
 * pset is locked.
 */
static boolean_t
sched_multiq_processor_enqueue(
    processor_t processor,
    thread_t    thread,
    integer_t   options)
{
    boolean_t result;

    assert(processor == thread->chosen_processor);

    if (thread->bound_processor != PROCESSOR_NULL) {
        assert(thread->bound_processor == processor);

        result = run_queue_enqueue(multiq_bound_runq(processor), thread, options);
        thread->runq = processor;

        return result;
    }

    sched_group_enqueue_thread(multiq_main_entryq(processor),
                               thread->sched_group,
                               thread, options);

    thread->runq = processor;

    return (FALSE);
}

/*
 * Called in the context of thread with thread and pset unlocked,
 * after updating thread priority but before propagating that priority
 * to the processor.
 */
void
sched_multiq_quantum_expire(thread_t thread)
{
    if (deep_drain) {
        /*
         * Move the entry at this priority to the end of the queue,
         * to allow the next task a shot at running.
         */

        processor_t processor = thread->last_processor;
        processor_set_t pset = processor->processor_set;
        entry_queue_t entryq = multiq_main_entryq(processor);

        pset_lock(pset);

        sched_entry_t entry = group_entry_for_pri(thread->sched_group, processor->current_pri);

        if (entry->runq == MULTIQ_ERUNQ) {
            entry_queue_remove_entry(entryq, entry);
            entry_queue_enqueue_entry(entryq, entry, SCHED_TAILQ);
        }

        pset_unlock(pset);
    }
}

static boolean_t
sched_multiq_processor_queue_empty(processor_t processor)
{
    return multiq_main_entryq(processor)->count == 0 &&
           multiq_bound_runq(processor)->count == 0;
}

static ast_t
sched_multiq_processor_csw_check(processor_t processor)
{
    boolean_t has_higher;
    int pri;

    entry_queue_t main_entryq = multiq_main_entryq(processor);
    run_queue_t   bound_runq  = multiq_bound_runq(processor);

    assert(processor->active_thread != NULL);

    pri = MAX(main_entryq->highq, bound_runq->highq);

    if (first_timeslice(processor)) {
        /* Running thread still has quantum: only strictly higher priority work preempts */
        has_higher = (pri > processor->current_pri);
    } else {
        has_higher = (pri >= processor->current_pri);
    }

    if (has_higher) {
        if (main_entryq->urgency > 0)
            return (AST_PREEMPT | AST_URGENT);

        if (bound_runq->urgency > 0)
            return (AST_PREEMPT | AST_URGENT);

        if (processor->active_thread && thread_eager_preemption(processor->active_thread))
            return (AST_PREEMPT | AST_URGENT);

        return AST_PREEMPT;
    }

    return AST_NONE;
}

static boolean_t
sched_multiq_processor_queue_has_priority(
    processor_t processor,
    int         priority,
    boolean_t   gte)
{
    int qpri = MAX(multiq_main_entryq(processor)->highq, multiq_bound_runq(processor)->highq);

    if (gte)
        return qpri >= priority;
    else
        return qpri > priority;
}

static boolean_t
sched_multiq_should_current_thread_rechoose_processor(processor_t processor)
{
    return (processor->current_pri < BASEPRI_RTQUEUES && processor->processor_primary != processor);
}

static int
sched_multiq_runq_count(processor_t processor)
{
    /*
     * TODO: Decide whether to keep a count of runnable threads in the pset
     * or just return something less than the true count.
     *
     * This needs to be fast, so no iterating the whole runq.
     *
     * Another possible decision is to remove this - with a global runq
     * it doesn't make much sense.
     */
    return multiq_main_entryq(processor)->count + multiq_bound_runq(processor)->count;
}

static uint64_t
sched_multiq_runq_stats_count_sum(processor_t processor)
{
    /*
     * TODO: This one does need to go through all the runqueues, but it's only needed for
     * the sched stats tool.
     */

    uint64_t bound_sum = multiq_bound_runq(processor)->runq_stats.count_sum;

    if (processor->cpu_id == processor->processor_set->cpu_set_low)
        return bound_sum + multiq_main_entryq(processor)->runq_stats.count_sum;
    else
        return bound_sum;
}

static int
sched_multiq_processor_bound_count(processor_t processor)
{
    return multiq_bound_runq(processor)->count;
}

static void
sched_multiq_processor_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    entry_queue_t main_entryq = multiq_main_entryq(processor);
    thread_t thread;
    queue_head_t tqueue;

    /* We only need to migrate threads if this is the last active processor in the pset */
    if (pset->online_processor_count > 0) {
        pset_unlock(pset);
        return;
    }

    queue_init(&tqueue);

    /* Note that we do not remove bound threads from the queues here */

    while (main_entryq->count > 0) {
        thread = sched_global_dequeue_thread(main_entryq);
        enqueue_tail(&tqueue, (queue_entry_t)thread);
    }

    pset_unlock(pset);

    while ((thread = (thread_t)(void*)dequeue_head(&tqueue)) != THREAD_NULL) {
        thread_lock(thread);

        thread_setrun(thread, SCHED_TAILQ);

        thread_unlock(thread);
    }
}

/*
 * Thread is locked.
 *
 * This is why we can never read sched_pri unless we have the thread locked -
 * which we do in the enqueue and remove cases, but not the dequeue case.
 */
static boolean_t
sched_multiq_processor_queue_remove(
    processor_t processor,
    thread_t    thread)
{
    boolean_t removed = FALSE;

    processor_set_t pset = processor->processor_set;

    pset_lock(pset);

    if (thread->runq != PROCESSOR_NULL) {
        /*
         * Thread is on a run queue and we have a lock on
         * that run queue.
         */

        assert(thread->runq == processor);

        if (thread->bound_processor != PROCESSOR_NULL) {
            assert(processor == thread->bound_processor);
            run_queue_remove(multiq_bound_runq(processor), thread);
            thread->runq = PROCESSOR_NULL;
        } else {
            sched_group_remove_thread(multiq_main_entryq(processor),
                                      thread->sched_group,
                                      thread);
        }

        removed = TRUE;
    }

    pset_unlock(pset);

    return removed;
}

/* pset is locked, returned unlocked */
static thread_t
sched_multiq_steal_thread(processor_set_t pset)
{
    pset_unlock(pset);
    return (THREAD_NULL);
}

/*
 * Scan the global queue for candidate groups, and scan those groups for
 * candidate threads.
 *
 * Returns TRUE if retry is needed.
 */
static boolean_t
group_scan(entry_queue_t runq)
{
    int count;
    queue_t q;
    sched_group_t group;
    sched_entry_t entry;

    if ((count = runq->count) > 0) {
        q = runq->queues + runq->highq;
        while (count > 0) {
            queue_iterate(q, entry, sched_entry_t, links) {
                group = group_for_entry(entry);
                if (group->runq.count > 0) {
                    if (runq_scan(&group->runq))
                        return (TRUE);
                }
                count--;
            }
            q--;
        }
    }

    return (FALSE);
}

static void
sched_multiq_thread_update_scan(void)
{
    boolean_t restart_needed = FALSE;
    processor_t processor = processor_list;
    processor_set_t pset;
    thread_t thread;
    spl_t s;

    /*
     * We update the threads associated with each processor (bound and idle threads)
     * and then update the threads in each pset runqueue.
     */

    do {
        do {
            pset = processor->processor_set;

            s = splsched();
            pset_lock(pset);

            restart_needed = runq_scan(multiq_bound_runq(processor));

            pset_unlock(pset);
            splx(s);

            if (restart_needed)
                break;

            thread = processor->idle_thread;
            if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) {
                if (thread_update_add_thread(thread) == FALSE) {
                    restart_needed = TRUE;
                    break;
                }
            }
        } while ((processor = processor->processor_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();

    } while (restart_needed);

    pset = &pset0;

    do {
        do {
            s = splsched();
            pset_lock(pset);

            restart_needed = group_scan(&pset->pset_runq);

            pset_unlock(pset);
            splx(s);

            if (restart_needed)
                break;
        } while ((pset = pset->pset_list) != NULL);

        /* Ok, we now have a collection of candidates -- fix them. */
        thread_update_process_threads();

    } while (restart_needed);
}