/*
 * Copyright (c) 2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/debug.h>
#include <kern/host.h>
#include <kern/kalloc.h>
#include <kern/kern_types.h>
#include <kern/machine.h>
#include <kern/simple_lock.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/sfi.h>
#include <kern/timer_call.h>
#include <kern/wait_queue.h>
#include <kern/ledger.h>
#include <pexpert/pexpert.h>

#include <libkern/kernel_mach_header.h>

#include <sys/kdebug.h>

#define SFI_DEBUG 0

#if SFI_DEBUG
#define dprintf(...) kprintf(__VA_ARGS__)
#else
#define dprintf(...) do { } while(0)
#endif

#ifdef MACH_BSD
extern sched_call_t workqueue_get_sched_callback(void);
#endif /* MACH_BSD */

/*
 * SFI (Selective Forced Idle) operates by enabling a global
 * timer on the SFI window interval. When it fires, all processors
 * running a thread that should be SFI-ed are sent an AST.
 * As threads become runnable while in their "off phase", they
 * are placed on a deferred ready queue. When a per-class
 * "on timer" fires, the ready threads for that class are
 * re-enqueued for running. As an optimization to avoid spurious
 * wakeups, the timer may be lazily programmed.
 */
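/*
 * Illustrative timeline (editor's sketch; the 100ms window and 20ms
 * off-time below are example values, not constants from this file).
 * For one enabled class within a single window:
 *
 *   global "off" timer fires            class "on" timer fires
 *   |                                   |
 *   v                                   v
 *   |<------- off phase (20ms) ------->|<------ on phase (80ms) ------>|
 *   0ms                               20ms                           100ms
 *
 * Runnable threads of the class arriving during the off phase park on
 * the class's wait queue; the "on" timer drains that queue all at once.
 */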
/*
 * The "sfi_lock" simple lock guards access to static configuration
 * parameters (as specified by userspace), dynamic state changes
 * (as updated by the timer event routine), and timer data structures.
 * Since it can be taken with interrupts disabled in some cases, all
 * uses should be at splsched() with interrupts disabled. The
 * "sfi_lock" also guards the "sfi_wait_class" field of thread_t, which
 * must only be accessed with that lock held.
 *
 * When an "on timer" fires, we must deterministically be able to drain
 * the wait queue, since if any threads are added to the queue afterwards,
 * they may never get woken out of SFI wait. So sfi_lock must be
 * taken before the wait queue's own spinlock.
 *
 * The wait queue will take the thread's scheduling lock. We may also take
 * the thread_lock directly to update the "sfi_class" field and determine
 * if the thread should block in the wait queue, but the lock will be
 * released before doing so.
 *
 * The pset lock may also be taken, but not while any other locks are held.
 *
 * splsched ---> sfi_lock ---> wait_queue ---> thread_lock
 *        \  \              \__ thread_lock (*)
 *         \  \__ pset_lock
 *          \
 *           \__ thread_lock
 */

decl_simple_lock_data(static,sfi_lock);
static timer_call_data_t sfi_timer_call_entry;
volatile boolean_t sfi_is_enabled;

boolean_t sfi_window_is_set;
uint64_t sfi_window_usecs;
uint64_t sfi_window_interval;
uint64_t sfi_next_off_deadline;

typedef struct {
    sfi_class_id_t      class_id;
    thread_continue_t   class_continuation;
    const char *        class_name;
    const char *        class_ledger_name;
} sfi_class_registration_t;

/*
 * To add a new SFI class:
 *
 * 1) Raise MAX_SFI_CLASS_ID in mach/sfi_class.h
 * 2) Add a #define for it to mach/sfi_class.h. It need not be inserted in order of restrictiveness.
 * 3) Add a call to SFI_CLASS_REGISTER below
 * 4) Augment sfi_thread_classify to categorize threads as early, and therefore as restrictively, as possible
 * 5) Modify thermald to use the SFI class
 */

static inline void _sfi_wait_cleanup(sched_call_t callback);

#define SFI_CLASS_REGISTER(class_id, ledger_name) \
extern char compile_time_assert_ ## class_id[SFI_CLASS_ ## class_id < MAX_SFI_CLASS_ID ? 1 : -1]; \
void __attribute__((noinline,noreturn)) SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused); \
void SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused) \
{ \
    _sfi_wait_cleanup(callback); \
    thread_exception_return(); \
} \
\
sfi_class_registration_t SFI_ ## class_id ## _registration __attribute__((section("__DATA,__sfi_class_reg"),used)) = { SFI_CLASS_ ## class_id, SFI_ ## class_id ## _THREAD_IS_WAITING, "SFI_CLASS_" # class_id, "SFI_CLASS_" # ledger_name };
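/*
 * For illustration (editor's sketch of the expansion; attributes omitted),
 * SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE) produces roughly:
 *
 *   extern char compile_time_assert_MAINTENANCE[
 *           SFI_CLASS_MAINTENANCE < MAX_SFI_CLASS_ID ? 1 : -1];
 *
 *   void SFI_MAINTENANCE_THREAD_IS_WAITING(void *callback, wait_result_t wret)
 *   {
 *           _sfi_wait_cleanup(callback);    // credit ledger, restore sched_call
 *           thread_exception_return();      // unwind to userspace; never returns
 *   }
 *
 *   sfi_class_registration_t SFI_MAINTENANCE_registration = {
 *           SFI_CLASS_MAINTENANCE,
 *           SFI_MAINTENANCE_THREAD_IS_WAITING,
 *           "SFI_CLASS_MAINTENANCE",
 *           "SFI_CLASS_MAINTENANCE",
 *   };
 *
 * The registration structure lands in the __DATA,__sfi_class_reg section,
 * which sfi_get_registration_data() walks at boot.
 */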
/* SFI_CLASS_UNSPECIFIED not included here */
SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE)
SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG)
SFI_CLASS_REGISTER(APP_NAP, APP_NAP)
SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED)
SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED)
SFI_CLASS_REGISTER(UTILITY, UTILITY)
SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT)
SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT)
SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY)
SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY)
SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED)
SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED)
SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE)
SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE)
SFI_CLASS_REGISTER(KERNEL, OPTED_OUT)
SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT)

struct sfi_class_state {
    uint64_t            off_time_usecs;     /* configured off-time, in usecs */
    uint64_t            off_time_interval;  /* configured off-time, in absolute time */

    timer_call_data_t   on_timer;           /* per-class "on" timer */
    boolean_t           on_timer_programmed;

    boolean_t           class_sfi_is_enabled;
    volatile boolean_t  class_in_on_phase;

    struct wait_queue   wait_queue;         /* threads in ready state */
    thread_continue_t   continuation;       /* continuation for waking threads */

    const char *        class_name;
    const char *        class_ledger_name;
};

/* Static configuration performed in sfi_early_init() */
struct sfi_class_state sfi_classes[MAX_SFI_CLASS_ID];

int sfi_enabled_class_count;

static void sfi_timer_global_off(
    timer_call_param_t  param0,
    timer_call_param_t  param1);

static void sfi_timer_per_class_on(
    timer_call_param_t  param0,
    timer_call_param_t  param1);

static sfi_class_registration_t *
sfi_get_registration_data(unsigned long *count)
{
    unsigned long sectlen = 0;
    void *sectdata;

    sectdata = getsectdatafromheader(&_mh_execute_header, "__DATA", "__sfi_class_reg", &sectlen);
    if (sectdata) {

        if (sectlen % sizeof(sfi_class_registration_t) != 0) {
            /* corrupt data? */
            panic("__sfi_class_reg section has invalid size %lu", sectlen);
            __builtin_unreachable();
        }

        *count = sectlen / sizeof(sfi_class_registration_t);
        return (sfi_class_registration_t *)sectdata;
    } else {
        panic("__sfi_class_reg section not found");
        __builtin_unreachable();
    }
}

/* Called early in boot, when kernel is single-threaded */
void sfi_early_init(void)
{
    unsigned long i, count;
    sfi_class_registration_t *registrations;

    registrations = sfi_get_registration_data(&count);
    for (i = 0; i < count; i++) {
        sfi_class_id_t class_id = registrations[i].class_id;

        assert(class_id < MAX_SFI_CLASS_ID); /* should be caught at compile-time */
        if (class_id < MAX_SFI_CLASS_ID) {
            if (sfi_classes[class_id].continuation != NULL) {
                panic("Duplicate SFI registration for class 0x%x", class_id);
            }
            sfi_classes[class_id].class_sfi_is_enabled = FALSE;
            sfi_classes[class_id].class_in_on_phase = TRUE;
            sfi_classes[class_id].continuation = registrations[i].class_continuation;
            sfi_classes[class_id].class_name = registrations[i].class_name;
            sfi_classes[class_id].class_ledger_name = registrations[i].class_ledger_name;
        }
    }
}

void sfi_init(void)
{
    sfi_class_id_t i;
    kern_return_t kret;

    simple_lock_init(&sfi_lock, 0);
    timer_call_setup(&sfi_timer_call_entry, sfi_timer_global_off, NULL);
    sfi_window_is_set = FALSE;
    sfi_enabled_class_count = 0;
    sfi_is_enabled = FALSE;

    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        /* If the class was set up in sfi_early_init(), initialize remaining fields */
        if (sfi_classes[i].continuation) {
            timer_call_setup(&sfi_classes[i].on_timer, sfi_timer_per_class_on, (void *)(uintptr_t)i);
            sfi_classes[i].on_timer_programmed = FALSE;

            kret = wait_queue_init(&sfi_classes[i].wait_queue, SYNC_POLICY_FIFO);
            assert(kret == KERN_SUCCESS);
        } else {
            /* The only allowed gap is for SFI_CLASS_UNSPECIFIED */
            if (i != SFI_CLASS_UNSPECIFIED) {
                panic("Gap in registered SFI classes");
            }
        }
    }
}

/* Can be called before sfi_init() by task initialization, but after sfi_early_init() */
sfi_class_id_t
sfi_get_ledger_alias_for_class(sfi_class_id_t class_id)
{
    sfi_class_id_t i;
    const char *ledger_name = NULL;

    ledger_name = sfi_classes[class_id].class_ledger_name;

    /* Find the first class in the registration table with this ledger name */
    if (ledger_name) {
        for (i = SFI_CLASS_UNSPECIFIED + 1; i < class_id; i++) {
            if (0 == strcmp(sfi_classes[i].class_ledger_name, ledger_name)) {
                dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, i);
                return i;
            }
        }

        /* This class is the primary one for the ledger, so there is no alias */
        dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, SFI_CLASS_UNSPECIFIED);
        return SFI_CLASS_UNSPECIFIED;
    }

    /* We are permissive on SFI class lookup failures here; sfi_init() asserts more strictly */
    return SFI_CLASS_UNSPECIFIED;
}
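/*
 * Worked example (editor's note; exact class ID values live in
 * mach/sfi_class.h): SFI_CLASS_MANAGED_FOCAL and SFI_CLASS_MANAGED_NONFOCAL
 * are both registered above with the ledger name "SFI_CLASS_MANAGED".
 * Assuming MANAGED_FOCAL has the lower class ID, the loop above yields:
 *
 *   sfi_get_ledger_alias_for_class(SFI_CLASS_MANAGED_FOCAL)
 *       -> SFI_CLASS_UNSPECIFIED    (it is the ledger's primary class)
 *   sfi_get_ledger_alias_for_class(SFI_CLASS_MANAGED_NONFOCAL)
 *       -> SFI_CLASS_MANAGED_FOCAL  (aliases to the primary class)
 */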
int
sfi_ledger_entry_add(ledger_template_t template, sfi_class_id_t class_id)
{
    const char *ledger_name = NULL;

    ledger_name = sfi_classes[class_id].class_ledger_name;

    dprintf("sfi_ledger_entry_add(%p, 0x%x) -> %s\n", template, class_id, ledger_name);
    return ledger_entry_add(template, ledger_name, "sfi", "MATUs"); /* units: Mach absolute time units */
}

static void sfi_timer_global_off(
    timer_call_param_t  param0 __unused,
    timer_call_param_t  param1 __unused)
{
    uint64_t now = mach_absolute_time();
    sfi_class_id_t i;
    processor_set_t pset, nset;
    processor_t processor;
    uint32_t needs_cause_ast_mask = 0x0;
    spl_t s;

    s = splsched();

    simple_lock(&sfi_lock);
    if (!sfi_is_enabled) {
        /* If SFI has been disabled, let all "on" timers drain naturally */
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_NONE, 1, 0, 0, 0, 0);

        simple_unlock(&sfi_lock);
        splx(s);
        return;
    }

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_START, 0, 0, 0, 0, 0);

    /* First set all configured classes into the off state, and program their "on" timer */
    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        if (sfi_classes[i].class_sfi_is_enabled) {
            uint64_t on_timer_deadline;

            sfi_classes[i].class_in_on_phase = FALSE;
            sfi_classes[i].on_timer_programmed = TRUE;

            /* Push out on-timer */
            on_timer_deadline = now + sfi_classes[i].off_time_interval;
            timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
        } else {
            /* If this class no longer needs SFI, make sure the timer is cancelled */
            sfi_classes[i].class_in_on_phase = TRUE;
            if (sfi_classes[i].on_timer_programmed) {
                sfi_classes[i].on_timer_programmed = FALSE;
                timer_call_cancel(&sfi_classes[i].on_timer);
            }
        }
    }
    simple_unlock(&sfi_lock);

    /* Iterate over processors, call cause_ast_check() on ones running a thread that should be in an off phase */
    processor = processor_list;
    pset = processor->processor_set;

    pset_lock(pset);

    do {
        nset = processor->processor_set;
        if (nset != pset) {
            pset_unlock(pset);
            pset = nset;
            pset_lock(pset);
        }

        /* "processor" and its pset are locked */
        if (processor->state == PROCESSOR_RUNNING) {
            if (AST_NONE != sfi_processor_needs_ast(processor)) {
                needs_cause_ast_mask |= (1U << processor->cpu_id);
            }
        }
    } while ((processor = processor->processor_list) != NULL);

    pset_unlock(pset);

    /* ASTs are delivered in a second pass, after all pset locks have been dropped */
    processor = processor_list;
    do {
        if (needs_cause_ast_mask & (1U << processor->cpu_id)) {
            if (processor == current_processor())
                ast_on(AST_SFI);
            else
                cause_ast_check(processor);
        }
    } while ((processor = processor->processor_list) != NULL);

    /* Re-arm timer if still enabled */
    simple_lock(&sfi_lock);
    if (sfi_is_enabled) {
        clock_deadline_for_periodic_event(sfi_window_interval,
            now,
            &sfi_next_off_deadline);
        timer_call_enter1(&sfi_timer_call_entry,
            NULL,
            sfi_next_off_deadline,
            TIMER_CALL_SYS_CRITICAL);
    }

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);

    simple_unlock(&sfi_lock);

    splx(s);
}
static void sfi_timer_per_class_on(
    timer_call_param_t  param0,
    timer_call_param_t  param1 __unused)
{
    sfi_class_id_t sfi_class_id = (sfi_class_id_t)(uintptr_t)param0;
    struct sfi_class_state *sfi_class = &sfi_classes[sfi_class_id];
    kern_return_t kret;
    spl_t s;

    s = splsched();

    simple_lock(&sfi_lock);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_START, sfi_class_id, 0, 0, 0, 0);

    /*
     * Any threads that may have accumulated in the ready queue for this class should get re-enqueued.
     * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
     * no new threads to be put on this wait queue until the global "off timer" has fired.
     */
    sfi_class->class_in_on_phase = TRUE;
    kret = wait_queue_wakeup64_all(&sfi_class->wait_queue,
        CAST_EVENT64_T(sfi_class_id),
        THREAD_AWAKENED);
    assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);

    simple_unlock(&sfi_lock);

    splx(s);
}


kern_return_t sfi_set_window(uint64_t window_usecs)
{
    uint64_t interval, deadline;
    uint64_t now = mach_absolute_time();
    sfi_class_id_t i;
    spl_t s;
    uint64_t largest_class_off_interval = 0;

    if (window_usecs < MIN_SFI_WINDOW_USEC)
        window_usecs = MIN_SFI_WINDOW_USEC;

    if (window_usecs > UINT32_MAX)
        return (KERN_INVALID_ARGUMENT);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_WINDOW), window_usecs, 0, 0, 0, 0);

    clock_interval_to_absolutetime_interval((uint32_t)window_usecs, NSEC_PER_USEC, &interval);
    deadline = now + interval;

    s = splsched();

    simple_lock(&sfi_lock);

    /* Check that we are not bringing in the SFI window smaller than any class */
    for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
        if (sfi_classes[i].class_sfi_is_enabled) {
            largest_class_off_interval = MAX(largest_class_off_interval, sfi_classes[i].off_time_interval);
        }
    }

    /*
     * The window must be strictly greater than the off-time of every enabled
     * class, otherwise threads would build up on the ready queue and never
     * be able to run.
     */
    if (interval <= largest_class_off_interval) {
        simple_unlock(&sfi_lock);
        splx(s);
        return (KERN_INVALID_ARGUMENT);
    }

    /*
     * If the new "off" deadline is further out than the current programmed timer,
     * just let the current one expire (and the new cadence will be established thereafter).
     * If the new "off" deadline is nearer than the current one, bring it in, so we
     * can start the new behavior sooner. Note that this may cause the "off" timer to
     * fire before some of the class "on" timers have fired.
     */
    sfi_window_usecs = window_usecs;
    sfi_window_interval = interval;
    sfi_window_is_set = TRUE;

    if (sfi_enabled_class_count == 0) {
        /* Can't program timer yet */
    } else if (!sfi_is_enabled) {
        sfi_is_enabled = TRUE;
        sfi_next_off_deadline = deadline;
        timer_call_enter1(&sfi_timer_call_entry,
            NULL,
            sfi_next_off_deadline,
            TIMER_CALL_SYS_CRITICAL);
    } else if (deadline >= sfi_next_off_deadline) {
        sfi_next_off_deadline = deadline;
    } else {
        sfi_next_off_deadline = deadline;
        timer_call_enter1(&sfi_timer_call_entry,
            NULL,
            sfi_next_off_deadline,
            TIMER_CALL_SYS_CRITICAL);
    }

    simple_unlock(&sfi_lock);
    splx(s);

    return (KERN_SUCCESS);
}
kern_return_t sfi_window_cancel(void)
{
    spl_t s;

    s = splsched();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_WINDOW), 0, 0, 0, 0, 0);

    /* Disable globals so that global "off-timer" is not re-armed */
    simple_lock(&sfi_lock);
    sfi_window_is_set = FALSE;
    sfi_window_usecs = 0;
    sfi_window_interval = 0;
    sfi_next_off_deadline = 0;
    sfi_is_enabled = FALSE;
    simple_unlock(&sfi_lock);

    splx(s);

    return (KERN_SUCCESS);
}


kern_return_t sfi_get_window(uint64_t *window_usecs)
{
    spl_t s;
    uint64_t off_window_us;

    s = splsched();
    simple_lock(&sfi_lock);

    off_window_us = sfi_window_usecs;

    simple_unlock(&sfi_lock);
    splx(s);

    *window_usecs = off_window_us;

    return (KERN_SUCCESS);
}


kern_return_t sfi_set_class_offtime(sfi_class_id_t class_id, uint64_t offtime_usecs)
{
    uint64_t interval;
    spl_t s;
    uint64_t off_window_interval;

    if (offtime_usecs < MIN_SFI_WINDOW_USEC)
        offtime_usecs = MIN_SFI_WINDOW_USEC;

    if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
        return (KERN_INVALID_ARGUMENT);

    if (offtime_usecs > UINT32_MAX)
        return (KERN_INVALID_ARGUMENT);

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_CLASS_OFFTIME), offtime_usecs, class_id, 0, 0, 0);

    clock_interval_to_absolutetime_interval((uint32_t)offtime_usecs, NSEC_PER_USEC, &interval);

    s = splsched();

    simple_lock(&sfi_lock);
    off_window_interval = sfi_window_interval;

    /* Check that we are not bringing in class off-time larger than the SFI window */
    if (off_window_interval && (interval >= off_window_interval)) {
        simple_unlock(&sfi_lock);
        splx(s);
        return (KERN_INVALID_ARGUMENT);
    }

    /* We never re-program the per-class on-timer, but rather just let it expire naturally */
    if (!sfi_classes[class_id].class_sfi_is_enabled) {
        sfi_enabled_class_count++;
    }
    sfi_classes[class_id].off_time_usecs = offtime_usecs;
    sfi_classes[class_id].off_time_interval = interval;
    sfi_classes[class_id].class_sfi_is_enabled = TRUE;

    if (sfi_window_is_set && !sfi_is_enabled) {
        /* start global off timer */
        sfi_is_enabled = TRUE;
        sfi_next_off_deadline = mach_absolute_time() + sfi_window_interval;
        timer_call_enter1(&sfi_timer_call_entry,
            NULL,
            sfi_next_off_deadline,
            TIMER_CALL_SYS_CRITICAL);
    }

    simple_unlock(&sfi_lock);

    splx(s);

    return (KERN_SUCCESS);
}
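/*
 * Illustrative configuration (editor's sketch; the values are made up, and
 * in practice these knobs are driven from userspace, e.g. by thermald):
 *
 *   sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, 40000);  // 40ms off-time
 *   sfi_set_window(100000);                             // 100ms window
 *
 * With this configuration, DARWIN_BG threads are forced idle for the first
 * 40ms of every 100ms window, i.e. they run at most 60% of the time.
 * sfi_set_window() rejects any window that is not strictly larger than the
 * largest enabled off-time, so a 30ms window with the 40ms off-time above
 * would return KERN_INVALID_ARGUMENT.
 */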
kern_return_t sfi_class_offtime_cancel(sfi_class_id_t class_id)
{
    spl_t s;

    if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
        return (KERN_INVALID_ARGUMENT);

    s = splsched();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_CLASS_OFFTIME), class_id, 0, 0, 0, 0);

    simple_lock(&sfi_lock);

    /* We never re-program the per-class on-timer, but rather just let it expire naturally */
    if (sfi_classes[class_id].class_sfi_is_enabled) {
        sfi_enabled_class_count--;
    }
    sfi_classes[class_id].off_time_usecs = 0;
    sfi_classes[class_id].off_time_interval = 0;
    sfi_classes[class_id].class_sfi_is_enabled = FALSE;

    if (sfi_enabled_class_count == 0) {
        sfi_is_enabled = FALSE;
    }

    simple_unlock(&sfi_lock);

    splx(s);

    return (KERN_SUCCESS);
}

kern_return_t sfi_get_class_offtime(sfi_class_id_t class_id, uint64_t *offtime_usecs)
{
    uint64_t off_time_us;
    spl_t s;

    if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID)
        return (KERN_INVALID_ARGUMENT);

    s = splsched();

    simple_lock(&sfi_lock);
    off_time_us = sfi_classes[class_id].off_time_usecs;
    simple_unlock(&sfi_lock);

    splx(s);

    *offtime_usecs = off_time_us;

    return (KERN_SUCCESS);
}

/*
 * sfi_thread_classify and sfi_processor_active_thread_classify perform the critical
 * role of quickly categorizing a thread into its SFI class so that an AST_SFI can be
 * set. As the thread is unwinding to userspace, sfi_ast() performs full locking
 * and determines whether the thread should enter an SFI wait state. Because of
 * the inherent races between the time the AST is set and when it is evaluated,
 * thread classification can be inaccurate (but should always be safe). This is
 * especially the case for sfi_processor_active_thread_classify, which must
 * classify the active thread on a remote processor without taking the thread lock.
 * When in doubt, classification should err on the side of *not* classifying a
 * thread at all, and wait for the thread itself to either hit a quantum expiration
 * or block inside the kernel.
 */

/*
 * Thread must be locked. Ultimately, the real decision to enter
 * SFI wait happens at the AST boundary.
 */
sfi_class_id_t sfi_thread_classify(thread_t thread)
{
    task_t task = thread->task;
    boolean_t is_kernel_thread = (task == kernel_task);
    sched_mode_t thmode = thread->sched_mode;
    int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS);
    int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE);
    int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG);
    int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED);
    int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);

    /* kernel threads never reach the user AST boundary, and are in a separate world for SFI */
    if (is_kernel_thread) {
        return SFI_CLASS_KERNEL;
    }

    if (thread_qos == THREAD_QOS_MAINTENANCE)
        return SFI_CLASS_MAINTENANCE;

    if (thread_bg || thread_qos == THREAD_QOS_BACKGROUND) {
        return SFI_CLASS_DARWIN_BG;
    }

    if (latency_qos != 0) {
        /* Editor's note: the effective latency QoS appears to be the 1-based latency tier */
        int latency_qos_wtf = latency_qos - 1;

        if ((latency_qos_wtf >= 4) && (latency_qos_wtf <= 5)) {
            return SFI_CLASS_APP_NAP;
        }
    }

    /*
     * Realtime and fixed priority threads express their duty cycle constraints
     * via other mechanisms, and are opted out of (most) forms of SFI
     */
    if (thmode == TH_MODE_REALTIME || thmode == TH_MODE_FIXED || task_role == TASK_GRAPHICS_SERVER) {
        return SFI_CLASS_OPTED_OUT;
    }

    /*
     * Threads with unspecified or legacy QOS class can be individually managed
     */
    if (managed_task &&
        (thread_qos == THREAD_QOS_UNSPECIFIED || thread_qos == THREAD_QOS_LEGACY)) {
        if (task_role == TASK_FOREGROUND_APPLICATION || task_role == TASK_CONTROL_APPLICATION)
            return SFI_CLASS_MANAGED_FOCAL;
        else
            return SFI_CLASS_MANAGED_NONFOCAL;
    }

    if (thread_qos == THREAD_QOS_UTILITY)
        return SFI_CLASS_UTILITY;

    if (task_role == TASK_FOREGROUND_APPLICATION || task_role == TASK_CONTROL_APPLICATION) {
        switch (thread_qos) {
        case THREAD_QOS_USER_INTERACTIVE:
            return SFI_CLASS_USER_INTERACTIVE_FOCAL;
        case THREAD_QOS_USER_INITIATED:
            return SFI_CLASS_USER_INITIATED_FOCAL;
        case THREAD_QOS_LEGACY:
            return SFI_CLASS_LEGACY_FOCAL;
        default:
            return SFI_CLASS_DEFAULT_FOCAL;
        }
    } else {
        switch (thread_qos) {
        case THREAD_QOS_USER_INTERACTIVE:
            return SFI_CLASS_USER_INTERACTIVE_NONFOCAL;
        case THREAD_QOS_USER_INITIATED:
            return SFI_CLASS_USER_INITIATED_NONFOCAL;
        case THREAD_QOS_LEGACY:
            return SFI_CLASS_LEGACY_NONFOCAL;
        default:
            return SFI_CLASS_DEFAULT_NONFOCAL;
        }
    }
}
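/*
 * Worked example (editor's note): a THREAD_QOS_BACKGROUND thread in a
 * TASK_FOREGROUND_APPLICATION is classified SFI_CLASS_DARWIN_BG, not a
 * *_FOCAL class, because the background check above precedes the focality
 * checks. Similarly, a THREAD_QOS_LEGACY thread in an SFI-managed focal
 * task lands in SFI_CLASS_MANAGED_FOCAL rather than SFI_CLASS_LEGACY_FOCAL,
 * since the managed-task check runs first. The ordering of checks is part
 * of the policy: earlier checks yield the more restrictive classifications.
 */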
/*
 * pset must be locked.
 */
sfi_class_id_t sfi_processor_active_thread_classify(processor_t processor)
{
    return processor->current_sfi_class;
}

/*
 * thread must be locked. This is inherently racy, with the intent that
 * at the AST boundary, it will be fully evaluated whether we need to
 * perform an AST wait
 */
ast_t sfi_thread_needs_ast(thread_t thread, sfi_class_id_t *out_class)
{
    sfi_class_id_t class_id;

    class_id = sfi_thread_classify(thread);

    if (out_class)
        *out_class = class_id;

    /* No lock taken, so a stale value may be used. */
    if (!sfi_classes[class_id].class_in_on_phase)
        return AST_SFI;
    else
        return AST_NONE;
}

/*
 * pset must be locked. We take the SFI class for
 * the currently running thread which is cached on
 * the processor_t, and assume it is accurate. In the
 * worst case, the processor will get an IPI and be asked
 * to evaluate if the current running thread at that
 * later point in time should be in an SFI wait.
 */
ast_t sfi_processor_needs_ast(processor_t processor)
{
    sfi_class_id_t class_id;

    class_id = sfi_processor_active_thread_classify(processor);

    /* No lock taken, so a stale value may be used. */
    if (!sfi_classes[class_id].class_in_on_phase)
        return AST_SFI;
    else
        return AST_NONE;
}

static inline void _sfi_wait_cleanup(sched_call_t callback)
{
    thread_t self = current_thread();
    sfi_class_id_t current_sfi_wait_class = SFI_CLASS_UNSPECIFIED;
    int64_t sfi_wait_time, sfi_wait_begin = 0;

    spl_t s = splsched();
    thread_lock(self);
    if (callback) {
        thread_sched_call(self, callback);
    }
    sfi_wait_begin = self->wait_sfi_begin_time;
    thread_unlock(self);

    simple_lock(&sfi_lock);
    sfi_wait_time = mach_absolute_time() - sfi_wait_begin;
    current_sfi_wait_class = self->sfi_wait_class;
    self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;
    simple_unlock(&sfi_lock);
    splx(s);
    assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) &&
        (current_sfi_wait_class < MAX_SFI_CLASS_ID));
    ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class], sfi_wait_time);
}
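/*
 * Lifecycle of an SFI wait, traced end to end (editor's summary of the code
 * in this file):
 *
 *   1. sfi_timer_global_off() starts a class's off phase and IPIs any
 *      processor running a thread of that class (AST_SFI).
 *   2. At the AST boundary, sfi_ast() classifies the thread, and if its
 *      class is in the off phase, parks it on the class's wait queue with
 *      the class's registered continuation (SFI_*_THREAD_IS_WAITING).
 *   3. sfi_timer_per_class_on() (or sfi_reevaluate()) wakes the thread.
 *   4. The continuation runs on the woken thread: _sfi_wait_cleanup()
 *      credits the time spent waiting to the task's per-class SFI ledger
 *      and restores any workqueue sched_call that sfi_ast() had cleared,
 *      then thread_exception_return() resumes userspace.
 */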
/*
 * Called at AST context to fully evaluate if the current thread
 * (which is obviously running) should instead block in an SFI wait.
 * We must take the sfi_lock to check whether we are in the "off" period
 * for the class, and if so, block.
 */
void sfi_ast(thread_t thread)
{
    sfi_class_id_t class_id;
    spl_t s;
    struct sfi_class_state *sfi_class;
    wait_result_t waitret;
    boolean_t did_wait = FALSE;
    uint64_t tid;
    thread_continue_t continuation;
    sched_call_t workq_callback = workqueue_get_sched_callback();
    boolean_t did_clear_wq = FALSE;

    s = splsched();

    simple_lock(&sfi_lock);

    if (!sfi_is_enabled) {
        /*
         * SFI is not enabled, or has recently been disabled.
         * There is no point putting this thread on a deferred ready
         * queue, even if it were classified as needing it, since
         * SFI will truly be off at the next global off timer
         */
        simple_unlock(&sfi_lock);
        splx(s);

        return;
    }

    thread_lock(thread);
    thread->sfi_class = class_id = sfi_thread_classify(thread);
    tid = thread_tid(thread);

    /*
     * Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
     * are committed to transitioning to whatever state is indicated by "->class_in_on_phase".
     * If another thread tries to call sfi_reevaluate() after this point, it will take the
     * sfi_lock and see the thread in this wait state. If another thread calls
     * sfi_reevaluate() before this point, it would see a runnable thread and at most
     * attempt to send an AST to this processor, but we would have the most accurate
     * classification.
     */

    /* Optimistically clear workq callback while thread is already locked */
    if (workq_callback && (thread->sched_call == workq_callback)) {
        thread_sched_call(thread, NULL);
        did_clear_wq = TRUE;
    }
    thread_unlock(thread);

    sfi_class = &sfi_classes[class_id];
    if (!sfi_class->class_in_on_phase) {
        /* Need to block thread in wait queue */
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER), tid, class_id, 0, 0, 0);

        waitret = wait_queue_assert_wait64(&sfi_class->wait_queue,
            CAST_EVENT64_T(class_id),
            THREAD_INTERRUPTIBLE,
            0);
        if (waitret == THREAD_WAITING) {
            thread->sfi_wait_class = class_id;
            did_wait = TRUE;
            continuation = sfi_class->continuation;
        } else {
            /* thread may be exiting already, all other errors are unexpected */
            assert(waitret == THREAD_INTERRUPTED);
        }
    }
    simple_unlock(&sfi_lock);

    splx(s);

    if (did_wait) {
        thread_block_reason(continuation, did_clear_wq ? workq_callback : NULL, AST_SFI);
    } else {
        if (did_clear_wq) {
            s = splsched();
            thread_lock(thread);
            thread_sched_call(thread, workq_callback);
            thread_unlock(thread);
            splx(s);
        }
    }
}
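/*
 * Illustrative scenario for sfi_reevaluate() below (editor's note, assuming
 * task policy updates invoke it when a task's focality changes): a
 * MANAGED_NONFOCAL thread is parked in SFI wait when its application is
 * brought to the foreground. Its classification becomes
 * SFI_CLASS_MANAGED_FOCAL, so sfi_reevaluate() wakes it out of the
 * MANAGED_NONFOCAL wait queue; if the new class is also in its off phase,
 * the thread may immediately re-block on the new class's queue via
 * sfi_ast(), as described in the comment in the function body.
 */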
/* Thread must be unlocked */
void sfi_reevaluate(thread_t thread)
{
    kern_return_t kret;
    spl_t s;
    sfi_class_id_t class_id, current_class_id;
    ast_t sfi_ast;

    s = splsched();

    simple_lock(&sfi_lock);

    thread_lock(thread);
    sfi_ast = sfi_thread_needs_ast(thread, &class_id);
    thread->sfi_class = class_id;

    /*
     * This routine chiefly exists to boost threads out of an SFI wait
     * if their classification changes before the "on" timer fires.
     *
     * If we calculate that a thread is in a different ->sfi_wait_class
     * than we think it should be (including no-SFI-wait), we need to
     * correct that:
     *
     * If the thread is in SFI wait and should not be (or should be waiting
     * on a different class' "on" timer), we wake it up. If needed, the
     * thread may immediately block again in the different SFI wait state.
     *
     * If the thread is not in an SFI wait state and it should be, we need
     * to get that thread's attention, possibly by sending an AST to another
     * processor.
     */

    if ((current_class_id = thread->sfi_wait_class) != SFI_CLASS_UNSPECIFIED) {

        thread_unlock(thread); /* not needed anymore */

        assert(current_class_id < MAX_SFI_CLASS_ID);

        if ((sfi_ast == AST_NONE) || (class_id != current_class_id)) {
            struct sfi_class_state *sfi_class = &sfi_classes[current_class_id];

            KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_WAIT_CANCELED), thread_tid(thread), current_class_id, class_id, 0, 0);

            kret = wait_queue_wakeup64_thread(&sfi_class->wait_queue,
                CAST_EVENT64_T(current_class_id),
                thread,
                THREAD_AWAKENED);
            assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);
        }
    } else {
        /*
         * Thread's current SFI wait class is not set, and because we
         * have the sfi_lock, it won't get set.
         */

        if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
            if (sfi_ast != AST_NONE) {
                if (thread == current_thread())
                    ast_on(sfi_ast);
                else {
                    processor_t processor = thread->last_processor;

                    if (processor != PROCESSOR_NULL &&
                        processor->state == PROCESSOR_RUNNING &&
                        processor->active_thread == thread) {
                        cause_ast_check(processor);
                    } else {
                        /*
                         * Runnable thread that's not on a CPU currently. When a processor
                         * does context switch to it, the AST will get set based on whether
                         * the thread is in its "off time".
                         */
                    }
                }
            }
        }

        thread_unlock(thread);
    }

    simple_unlock(&sfi_lock);
    splx(s);
}