/*
 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/host_priv.h>
#include <mach/host_special_ports.h>
#include <mach/mach_types.h>
#include <mach/telemetry_notification_server.h>

#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/debug.h>
#include <kern/host.h>
#include <kern/kalloc.h>
#include <kern/kern_types.h>
#include <kern/locks.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/telemetry.h>
#include <kern/timer_call.h>

#include <pexpert/pexpert.h>

#include <vm/vm_kern.h>
#include <vm/vm_shared_region.h>

#include <kperf/kperf.h>
#include <kperf/context.h>
#include <kperf/callstack.h>

#include <sys/kdebug.h>
#include <uuid/uuid.h>
#include <kdp/kdp_dyld.h>

#define TELEMETRY_DEBUG 0

extern int proc_pid(void *);
extern char *proc_name_address(void *p);
extern uint64_t proc_uniqueid(void *p);
extern uint64_t proc_was_throttled(void *p);
extern uint64_t proc_did_throttle(void *p);
extern uint64_t get_dispatchqueue_serialno_offset_from_proc(void *p);
extern int proc_selfpid(void);

void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags);

#define TELEMETRY_DEFAULT_SAMPLE_RATE (1) /* 1 sample every 1 second */
#define TELEMETRY_DEFAULT_BUFFER_SIZE (16*1024)
#define TELEMETRY_MAX_BUFFER_SIZE     (64*1024)

#define TELEMETRY_DEFAULT_NOTIFY_LEEWAY (4*1024) // Userland gets 4k of leeway to collect data after notification
#define TELEMETRY_MAX_UUID_COUNT        (128)    // Max of 128 non-shared-cache UUIDs to log for symbolication

uint32_t telemetry_sample_rate = 0;
volatile boolean_t telemetry_needs_record = FALSE;
volatile boolean_t telemetry_needs_timer_arming_record = FALSE;

/*
 * If TRUE, record micro-stackshot samples for all tasks.
 * If FALSE, only sample tasks which are marked for telemetry.
 */
boolean_t telemetry_sample_all_tasks = FALSE;
uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry

uint32_t telemetry_timestamp = 0;

vm_offset_t telemetry_buffer = 0;
uint32_t telemetry_buffer_size = 0;
uint32_t telemetry_buffer_current_position = 0;
uint32_t telemetry_buffer_end_point = 0; // If we've wrapped, where does the last record end?
int telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
int telemetry_buffer_notify_at = 0;
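
/*
 * Summary of the ring-buffer bookkeeping above (all fields protected by
 * telemetry_mtx): telemetry_buffer_current_position is the write cursor,
 * telemetry_buffer_end_point marks where valid data ends once the cursor
 * has wrapped, and telemetry_buffer_notify_at is the fill level (in bytes
 * written since the last mark) at which userland is notified to drain the
 * buffer.
 */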

lck_grp_t telemetry_lck_grp;
lck_mtx_t telemetry_mtx;

#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while(0)
#define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx)
#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while(0)

void telemetry_init(void)
{
    kern_return_t ret;
    uint32_t telemetry_notification_leeway;

    lck_grp_init(&telemetry_lck_grp, "telemetry group", LCK_GRP_ATTR_NULL);
    lck_mtx_init(&telemetry_mtx, &telemetry_lck_grp, LCK_ATTR_NULL);

    if (!PE_parse_boot_argn("telemetry_buffer_size", &telemetry_buffer_size, sizeof(telemetry_buffer_size))) {
        telemetry_buffer_size = TELEMETRY_DEFAULT_BUFFER_SIZE;
    }

    if (telemetry_buffer_size > TELEMETRY_MAX_BUFFER_SIZE)
        telemetry_buffer_size = TELEMETRY_MAX_BUFFER_SIZE;

    ret = kmem_alloc(kernel_map, &telemetry_buffer, telemetry_buffer_size);
    if (ret != KERN_SUCCESS) {
        kprintf("Telemetry: Allocation failed: %d\n", ret);
        return;
    }

    if (!PE_parse_boot_argn("telemetry_notification_leeway", &telemetry_notification_leeway, sizeof(telemetry_notification_leeway))) {
        /*
         * By default, notify the user to collect the buffer when there is this much space left in the buffer.
         */
        telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
    }
    if (telemetry_notification_leeway >= telemetry_buffer_size) {
        printf("telemetry: nonsensical telemetry_notification_leeway boot-arg %d changed to %d\n",
            telemetry_notification_leeway, TELEMETRY_DEFAULT_NOTIFY_LEEWAY);
        telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
    }
    telemetry_buffer_notify_at = telemetry_buffer_size - telemetry_notification_leeway;

    if (!PE_parse_boot_argn("telemetry_sample_rate", &telemetry_sample_rate, sizeof(telemetry_sample_rate))) {
        telemetry_sample_rate = TELEMETRY_DEFAULT_SAMPLE_RATE;
    }

    /*
     * Telemetry for all tasks is on by default; include
     * "telemetry_sample_all_tasks=0" in boot-args to restrict sampling
     * to tasks that have explicitly opted in.
     */
    if (!PE_parse_boot_argn("telemetry_sample_all_tasks", &telemetry_sample_all_tasks, sizeof(telemetry_sample_all_tasks))) {
        telemetry_sample_all_tasks = TRUE;
    }

    kprintf("Telemetry: Sampling %stasks once per %u second%s\n",
        (telemetry_sample_all_tasks) ? "all " : "",
        telemetry_sample_rate, telemetry_sample_rate == 1 ? "" : "s");
}
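
/*
 * Example (illustrative only): a boot-args line that samples all tasks into
 * a 32KB buffer once every 4 seconds, per the semantics printed by
 * telemetry_init() above:
 *
 *    telemetry_sample_all_tasks=1 telemetry_buffer_size=32768 telemetry_sample_rate=4
 *
 * telemetry_buffer_size is clamped to TELEMETRY_MAX_BUFFER_SIZE, and the
 * notification leeway must be smaller than the buffer itself.
 */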

/*
 * Enable or disable global microstackshots (i.e. telemetry_sample_all_tasks).
 *
 * enable_disable == 1: turn it on
 * enable_disable == 0: turn it off
 */
void
telemetry_global_ctl(int enable_disable)
{
    if (enable_disable == 1) {
        telemetry_sample_all_tasks = TRUE;
    } else {
        telemetry_sample_all_tasks = FALSE;
    }
}

/*
 * Opt the given task into or out of the telemetry stream.
 *
 * Supported reasons (callers may use any or all of):
 *     TF_CPUMON_WARNING
 *     TF_WAKEMON_WARNING
 *
 * enable_disable == 1: turn it on
 * enable_disable == 0: turn it off
 */
void
telemetry_task_ctl(task_t task, uint32_t reasons, int enable_disable)
{
    task_lock(task);
    telemetry_task_ctl_locked(task, reasons, enable_disable);
    task_unlock(task);
}

void
telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable)
{
    uint32_t origflags;

    assert((reasons != 0) && ((reasons | TF_TELEMETRY) == TF_TELEMETRY));

    task_lock_assert_owned(task);

    origflags = task->t_flags;

    if (enable_disable == 1) {
        task->t_flags |= reasons;
        if ((origflags & TF_TELEMETRY) == 0) {
            OSIncrementAtomic(&telemetry_active_tasks);
#if TELEMETRY_DEBUG
            printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
#endif
        }
    } else {
        task->t_flags &= ~reasons;
        if (((origflags & TF_TELEMETRY) != 0) && ((task->t_flags & TF_TELEMETRY) == 0)) {
            /*
             * If this task went from having at least one telemetry bit to having none,
             * the net change was to disable telemetry for the task.
             */
            OSDecrementAtomic(&telemetry_active_tasks);
#if TELEMETRY_DEBUG
            printf("%s: telemetry ON -> OFF (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
#endif
        }
    }
}
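
/*
 * Illustrative call sequence (not taken from this file): a resource-monitor
 * warning path might opt a task into the stream with
 *
 *    telemetry_task_ctl(task, TF_CPUMON_WARNING, 1);
 *
 * and back out, once the warning clears, with
 *
 *    telemetry_task_ctl(task, TF_CPUMON_WARNING, 0);
 *
 * The active-task count only changes when the task's overall TF_TELEMETRY
 * state flips, so overlapping reasons compose safely.
 */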

/*
 * Determine if the current thread is eligible for telemetry:
 *
 * telemetry_sample_all_tasks: All threads are eligible. This takes precedence.
 * telemetry_active_tasks: Count of tasks opted in.
 * task->t_flags & TF_TELEMETRY: This task is opted in.
 */
static boolean_t
telemetry_is_active(thread_t thread)
{
    if (telemetry_sample_all_tasks == TRUE) {
        return (TRUE);
    }

    if ((telemetry_active_tasks > 0) && ((thread->task->t_flags & TF_TELEMETRY) != 0)) {
        return (TRUE);
    }

    return (FALSE);
}

/*
 * Userland is arming a timer. If we are eligible for such a record,
 * sample now. No need to do this one at the AST because we're already at
 * a safe place in this system call.
 */
int telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval, __unused uint64_t leeway)
{
    if (telemetry_needs_timer_arming_record == TRUE) {
        telemetry_needs_timer_arming_record = FALSE;
        telemetry_take_sample(current_thread(), kTimerArmingRecord | kUserMode);
    }

    return (0);
}

/*
 * Mark the current thread for an interrupt-based
 * telemetry record, to be sampled at the next AST boundary.
 */
void telemetry_mark_curthread(boolean_t interrupted_userspace)
{
    thread_t thread = current_thread();

    /*
     * If telemetry isn't active for this thread, return and try
     * again next time.
     */
    if (telemetry_is_active(thread) == FALSE) {
        return;
    }

    telemetry_needs_record = FALSE;
    thread_ast_set(thread, interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL);
    ast_propagate(thread->ast);
}
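
/*
 * How the pieces fit together (summary drawn from the code in this file):
 * compute_telemetry(), below, latches telemetry_needs_record once per
 * sample period; interrupt handlers consume that flag via
 * telemetry_mark_curthread(), which pends AST_TELEMETRY_USER or
 * AST_TELEMETRY_KERNEL on the interrupted thread; and when that thread next
 * crosses the AST boundary, telemetry_ast() takes the actual sample at a
 * point where faulting copyins from userland are safe.
 */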

void compute_telemetry(void *arg __unused)
{
    if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) {
        if ((++telemetry_timestamp) % telemetry_sample_rate == 0) {
            /*
             * To avoid overloading the system with telemetry ASTs, make
             * sure we don't add more requests while existing ones
             * are in-flight.
             */
            if (TELEMETRY_TRY_SPIN_LOCK()) {
                telemetry_needs_record = TRUE;
                telemetry_needs_timer_arming_record = TRUE;
                TELEMETRY_UNLOCK();
            }
        }
    }
}

/*
 * If userland has registered a port for telemetry notifications, send one now.
 */
static void
telemetry_notify_user(void)
{
    mach_port_t user_port;
    uint32_t flags = 0;
    int error;

    error = host_get_telemetry_port(host_priv_self(), &user_port);
    if ((error != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
        return;
    }

    telemetry_notification(user_port, flags);
}

void telemetry_ast(thread_t thread, boolean_t interrupted_userspace)
{
    uint8_t microsnapshot_flags = kInterruptRecord;

    if (interrupted_userspace)
        microsnapshot_flags |= kUserMode;

    telemetry_take_sample(thread, microsnapshot_flags);
}

void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags)
{
    task_t task;
    void *p;
    struct kperf_context ctx;
    struct callstack cs;
    uint32_t btcount, bti;
    struct micro_snapshot *msnap;
    struct task_snapshot *tsnap;
    struct thread_snapshot *thsnap;
    clock_sec_t secs;
    clock_usec_t usecs;
    vm_size_t framesize;
    uint32_t current_record_start;
    uint32_t tmp = 0;
    boolean_t notify = FALSE;

    if (thread == THREAD_NULL)
        return;

    task = thread->task;
    if ((task == TASK_NULL) || (task == kernel_task))
        return;

    /* telemetry_XXX accessed outside of lock for instrumentation only */
    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START, microsnapshot_flags, telemetry_bytes_since_last_mark, 0, 0, 0);

    p = get_bsdtask_info(task);

    ctx.cur_thread = thread;
    ctx.cur_pid = proc_pid(p);

    /*
     * Gather up the data we'll need for this sample. The sample is written into the kernel
     * buffer with the global telemetry lock held -- so we must do our (possibly faulting)
     * copies from userland here, before taking the lock.
     */
    kperf_ucallstack_sample(&cs, &ctx);
    if (!(cs.flags & CALLSTACK_VALID))
        return;

    /*
     * Find the actual [slid] address of the shared cache's UUID, and copy it in from userland.
     */
    int shared_cache_uuid_valid = 0;
    uint64_t shared_cache_base_address;
    struct _dyld_cache_header shared_cache_header;
    uint64_t shared_cache_slide;

    /*
     * Don't copy in the entire shared cache header; we only need the UUID. Calculate the
     * offset of that one field.
     */
    int sc_header_uuid_offset = (char *)&shared_cache_header.uuid - (char *)&shared_cache_header;
    vm_shared_region_t sr = vm_shared_region_get(task);
    if (sr != NULL) {
        if ((vm_shared_region_start_address(sr, &shared_cache_base_address) == KERN_SUCCESS) &&
            (copyin(shared_cache_base_address + sc_header_uuid_offset, (char *)&shared_cache_header.uuid,
                sizeof (shared_cache_header.uuid)) == 0)) {
            shared_cache_uuid_valid = 1;
            shared_cache_slide = vm_shared_region_get_slide(sr);
        }
        // vm_shared_region_get() gave us a reference on the shared region.
        vm_shared_region_deallocate(sr);
    }

    /*
     * Retrieve the array of UUIDs for binaries used by this task.
     * We reach down into DYLD's data structures to find the array.
     *
     * XXX - make this common with kdp?
     */
    uint32_t uuid_info_count = 0;
    mach_vm_address_t uuid_info_addr = 0;
    if (task_has_64BitAddr(task)) {
        struct user64_dyld_all_image_infos task_image_infos;
        if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
            uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
            uuid_info_addr = task_image_infos.uuidArray;
        }
    } else {
        struct user32_dyld_all_image_infos task_image_infos;
        if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
            uuid_info_count = task_image_infos.uuidArrayCount;
            uuid_info_addr = task_image_infos.uuidArray;
        }
    }

    /*
     * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
     * this data structure), we zero the uuid_info_count so that we won't even try to save load info
     * for this task.
     */
    if (!uuid_info_addr) {
        uuid_info_count = 0;
    }

    /*
     * Don't copy in an unbounded amount of memory. The main binary and interesting
     * non-shared-cache libraries should be in the first few images.
     */
    if (uuid_info_count > TELEMETRY_MAX_UUID_COUNT) {
        uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
    }

    uint32_t uuid_info_size = (uint32_t)(task_has_64BitAddr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
    uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
    char *uuid_info_array = NULL;

    if (uuid_info_count > 0) {
        if ((uuid_info_array = (char *)kalloc(uuid_info_array_size)) == NULL) {
            return;
        }

        /*
         * Copy in the UUID info array.
         * It may be nonresident, in which case just fix up nloadinfos to 0 in the task snapshot.
         */
        if (copyin(uuid_info_addr, uuid_info_array, uuid_info_array_size) != 0) {
            kfree(uuid_info_array, uuid_info_array_size);
            uuid_info_array = NULL;
            uuid_info_array_size = 0;
        }
    }

    /*
     * Look for a dispatch queue serial number, and copy it in from userland if present.
     */
    uint64_t dqserialnum = 0;
    int dqserialnum_valid = 0;

    uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
    if (dqkeyaddr != 0) {
        uint64_t dqaddr = 0;
        uint64_t dq_serialno_offset = get_dispatchqueue_serialno_offset_from_proc(task->bsd_info);
        if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64BitAddr(task) ? 8 : 4)) == 0) &&
            (dqaddr != 0) && (dq_serialno_offset != 0)) {
            uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
            if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64BitAddr(task) ? 8 : 4)) == 0) {
                dqserialnum_valid = 1;
            }
        }
    }

    clock_get_calendar_microtime(&secs, &usecs);

    TELEMETRY_LOCK();

    /*
     * We do the bulk of the operation under the telemetry lock, on assumption that
     * any page faults during execution will not cause another AST_TELEMETRY_ALL
     * to deadlock; they will just block until we finish. This makes it easier
     * to copy into the buffer directly. As soon as we unlock, userspace can copy
     * out of our buffer.
     */
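
    /*
     * Layout of one complete record, in the order written below (components
     * are packed back to back, with no alignment padding between them):
     *
     *    struct micro_snapshot
     *    struct task_snapshot
     *    UUID info array            (task_snapshot.nloadinfos entries)
     *    struct thread_snapshot
     *    dispatch queue serial no.  (only if kHasDispatchSerial is set)
     *    user backtrace             (thread_snapshot.nuser_frames frames of
     *                                4 or 8 bytes each, per kUser64_p)
     *
     * Whenever a component won't fit in the space remaining, the whole
     * record is restarted at offset 0, so records are never split by the
     * wrap.
     */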

copytobuffer:

    current_record_start = telemetry_buffer_current_position;

    if ((telemetry_buffer_size - telemetry_buffer_current_position) < sizeof(struct micro_snapshot)) {
        /*
         * We can't fit a record in the space available, so wrap around to the beginning.
         * Save the current position as the known end point of valid data.
         */
        telemetry_buffer_end_point = current_record_start;
        telemetry_buffer_current_position = 0;
        if (current_record_start == 0) {
            /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
            goto cancel_sample;
        }
        goto copytobuffer;
    }

    msnap = (struct micro_snapshot *)(uintptr_t)(telemetry_buffer + telemetry_buffer_current_position);
    msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC;
    msnap->ms_flags = microsnapshot_flags;
    msnap->ms_opaque_flags = 0; /* namespace managed by userspace */
    msnap->ms_cpu = 0; /* XXX - does this field make sense for a micro-stackshot? */
    msnap->ms_time = secs;
    msnap->ms_time_microsecs = usecs;

    telemetry_buffer_current_position += sizeof(struct micro_snapshot);

    if ((telemetry_buffer_size - telemetry_buffer_current_position) < sizeof(struct task_snapshot)) {
        telemetry_buffer_end_point = current_record_start;
        telemetry_buffer_current_position = 0;
        if (current_record_start == 0) {
            /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
            goto cancel_sample;
        }
        goto copytobuffer;
    }

    tsnap = (struct task_snapshot *)(uintptr_t)(telemetry_buffer + telemetry_buffer_current_position);
    bzero(tsnap, sizeof(*tsnap));
    tsnap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC;
    tsnap->pid = proc_pid(p);
    tsnap->uniqueid = proc_uniqueid(p);
    tsnap->user_time_in_terminated_threads = task->total_user_time;
    tsnap->system_time_in_terminated_threads = task->total_system_time;
    tsnap->suspend_count = task->suspend_count;
    tsnap->task_size = pmap_resident_count(task->map->pmap);
    tsnap->faults = task->faults;
    tsnap->pageins = task->pageins;
    tsnap->cow_faults = task->cow_faults;
    /*
     * The throttling counters are maintained as 64-bit counters in the proc
     * structure. However, we reserve 32 bits (each) for them in the task_snapshot
     * struct to save space, since we do not expect them to overflow 32 bits. If we
     * find these values overflowing in the future, the fix is simply to widen
     * these counters to 64-bit in the task_snapshot struct.
     */
    tsnap->was_throttled = (uint32_t) proc_was_throttled(p);
    tsnap->did_throttle = (uint32_t) proc_did_throttle(p);

    if (task->t_flags & TF_TELEMETRY) {
        tsnap->ss_flags |= kTaskRsrcFlagged;
    }

    if (task->effective_policy.darwinbg == 1) {
        tsnap->ss_flags |= kTaskDarwinBG;
    }

    proc_get_darwinbgstate(task, &tmp);

    if (task->requested_policy.t_role == TASK_FOREGROUND_APPLICATION) {
        tsnap->ss_flags |= kTaskIsForeground;
    }

    if (tmp & PROC_FLAG_ADAPTIVE_IMPORTANT) {
        tsnap->ss_flags |= kTaskIsBoosted;
    }

    if (tmp & PROC_FLAG_SUPPRESSED) {
        tsnap->ss_flags |= kTaskIsSuppressed;
    }

    tsnap->latency_qos = task_grab_latency_qos(task);

    strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
    if (task_has_64BitAddr(thread->task)) {
        tsnap->ss_flags |= kUser64_p;
    }

    if (shared_cache_uuid_valid) {
        tsnap->shared_cache_slide = shared_cache_slide;
        bcopy(shared_cache_header.uuid, tsnap->shared_cache_identifier, sizeof (shared_cache_header.uuid));
    }

    telemetry_buffer_current_position += sizeof(struct task_snapshot);

    /*
     * Directly after the task snapshot, place the array of UUIDs corresponding to the binaries
     * used by this task.
     */
    if ((telemetry_buffer_size - telemetry_buffer_current_position) < uuid_info_array_size) {
        telemetry_buffer_end_point = current_record_start;
        telemetry_buffer_current_position = 0;
        if (current_record_start == 0) {
            /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
            goto cancel_sample;
        }
        goto copytobuffer;
    }

    /*
     * Copy the UUID info array into our sample.
     */
    if (uuid_info_array_size > 0) {
        bcopy(uuid_info_array, (char *)(telemetry_buffer + telemetry_buffer_current_position), uuid_info_array_size);
        tsnap->nloadinfos = uuid_info_count;
    }

    telemetry_buffer_current_position += uuid_info_array_size;

    /*
     * After the task snapshot & list of binary UUIDs, we place a thread snapshot.
     */

    if ((telemetry_buffer_size - telemetry_buffer_current_position) < sizeof(struct thread_snapshot)) {
        /* wrap and overwrite */
        telemetry_buffer_end_point = current_record_start;
        telemetry_buffer_current_position = 0;
        if (current_record_start == 0) {
            /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
            goto cancel_sample;
        }
        goto copytobuffer;
    }

    thsnap = (struct thread_snapshot *)(uintptr_t)(telemetry_buffer + telemetry_buffer_current_position);
    bzero(thsnap, sizeof(*thsnap));

    thsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
    thsnap->thread_id = thread_tid(thread);
    thsnap->state = thread->state;
    thsnap->priority = thread->priority;
    thsnap->sched_pri = thread->sched_pri;
    thsnap->sched_flags = thread->sched_flags;
    thsnap->ss_flags |= kStacksPCOnly;

    if (thread->effective_policy.darwinbg) {
        thsnap->ss_flags |= kThreadDarwinBG;
    }

    thsnap->user_time = timer_grab(&thread->user_timer);

    uint64_t tval = timer_grab(&thread->system_timer);

    if (thread->precise_user_kernel_time) {
        thsnap->system_time = tval;
    } else {
        thsnap->user_time += tval;
        thsnap->system_time = 0;
    }

    telemetry_buffer_current_position += sizeof(struct thread_snapshot);

    /*
     * If this thread has a dispatch queue serial number, include it here.
     */
    if (dqserialnum_valid) {
        if ((telemetry_buffer_size - telemetry_buffer_current_position) < sizeof(dqserialnum)) {
            /* wrap and overwrite */
            telemetry_buffer_end_point = current_record_start;
            telemetry_buffer_current_position = 0;
            if (current_record_start == 0) {
                /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
                goto cancel_sample;
            }
            goto copytobuffer;
        }

        thsnap->ss_flags |= kHasDispatchSerial;
        bcopy(&dqserialnum, (char *)telemetry_buffer + telemetry_buffer_current_position, sizeof (dqserialnum));
        telemetry_buffer_current_position += sizeof (dqserialnum);
    }

    if (task_has_64BitAddr(task)) {
        framesize = 8;
        thsnap->ss_flags |= kUser64_p;
    } else {
        framesize = 4;
    }

    btcount = cs.nframes;

    /*
     * If we can't fit this entire stacktrace then cancel this record, wrap to the beginning,
     * and start again there so that we always store a full record.
     */
    if ((telemetry_buffer_size - telemetry_buffer_current_position) / framesize < btcount) {
        telemetry_buffer_end_point = current_record_start;
        telemetry_buffer_current_position = 0;
        if (current_record_start == 0) {
            /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
            goto cancel_sample;
        }
        goto copytobuffer;
    }

    for (bti = 0; bti < btcount; bti++, telemetry_buffer_current_position += framesize) {
        if (framesize == 8) {
            *(uint64_t *)(uintptr_t)(telemetry_buffer + telemetry_buffer_current_position) = cs.frames[bti];
        } else {
            *(uint32_t *)(uintptr_t)(telemetry_buffer + telemetry_buffer_current_position) = (uint32_t)cs.frames[bti];
        }
    }

    if (telemetry_buffer_end_point < telemetry_buffer_current_position) {
        /*
         * Each time the cursor wraps around to the beginning, we leave a
         * differing amount of unused space at the end of the buffer. Make
         * sure the cursor pushes the end point in case we're making use of
         * more of the buffer than we did the last time we wrapped.
         */
        telemetry_buffer_end_point = telemetry_buffer_current_position;
    }

    thsnap->nuser_frames = btcount;

    telemetry_bytes_since_last_mark += (telemetry_buffer_current_position - current_record_start);
    if (telemetry_bytes_since_last_mark > telemetry_buffer_notify_at) {
        notify = TRUE;
    }

cancel_sample:

    TELEMETRY_UNLOCK();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END, notify, telemetry_bytes_since_last_mark, telemetry_buffer_current_position, telemetry_buffer_end_point, 0);

    if (notify) {
        telemetry_notify_user();
    }

    if (uuid_info_array != NULL) {
        kfree(uuid_info_array, uuid_info_array_size);
    }
}

#if TELEMETRY_DEBUG
static void
log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz)
{
    struct micro_snapshot *p;
    uint32_t offset;

    printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos);

    buf += pos;

    /*
     * Find and log each timestamp in this chunk of buffer.
     */
    for (offset = 0; offset < sz; offset++) {
        p = (struct micro_snapshot *)(buf + offset);
        if (p->snapshot_magic == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
            printf("telemetry timestamp: %lld\n", p->ms_time);
        }
    }
}
#endif

int telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark)
{
    int result = 0;
    uint32_t oldest_record_offset;

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START, mark, telemetry_bytes_since_last_mark, 0, 0, 0);

    TELEMETRY_LOCK();

    if (telemetry_buffer == 0) {
        *length = 0;
        goto out;
    }

    if (*length < telemetry_buffer_size) {
        result = KERN_NO_SPACE;
        goto out;
    }

    /*
     * Copy the ring buffer out to userland in order sorted by time: least recent to most recent.
     * First, we need to search forward from the cursor to find the oldest record in our buffer.
     */
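
    /*
     * Illustration (not to scale) of the buffer after the cursor has
     * wrapped at least once:
     *
     *    0               cursor            end_point       buffer_size
     *    | newest records | oldest records | unused slack  |
     *
     * The oldest record starts at the first magic number found at or after
     * the cursor; the two copyouts below then emit [oldest, end_point)
     * followed by [0, cursor), so userland sees records in time order.
     */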
    oldest_record_offset = telemetry_buffer_current_position;
    do {
        if ((oldest_record_offset == telemetry_buffer_size) ||
            (oldest_record_offset == telemetry_buffer_end_point)) {

            if (*(uint32_t *)(uintptr_t)(telemetry_buffer) == 0) {
                /*
                 * There is no magic number at the start of the buffer, which means
                 * it's empty; nothing to see here yet.
                 */
                *length = 0;
                goto out;
            }
            /*
             * We've looked through the end of the active buffer without finding a valid
             * record; that means all valid records are in a single chunk, beginning at
             * the very start of the buffer.
             */

            oldest_record_offset = 0;
            assert(*(uint32_t *)(uintptr_t)(telemetry_buffer) == STACKSHOT_MICRO_SNAPSHOT_MAGIC);
            break;
        }

        if (*(uint32_t *)(uintptr_t)(telemetry_buffer + oldest_record_offset) == STACKSHOT_MICRO_SNAPSHOT_MAGIC)
            break;

        /*
         * There are no alignment guarantees for micro-stackshot records, so we must search at each
         * byte offset.
         */
        oldest_record_offset++;
    } while (oldest_record_offset != telemetry_buffer_current_position);

    /*
     * If needed, copyout in two chunks: from the oldest record to the end of the buffer, and then
     * from the beginning of the buffer up to the current position.
     */
    if (oldest_record_offset != 0) {
#if TELEMETRY_DEBUG
        log_telemetry_output(telemetry_buffer, oldest_record_offset,
            telemetry_buffer_end_point - oldest_record_offset);
#endif
        if ((result = copyout((void *)(telemetry_buffer + oldest_record_offset), buffer,
            telemetry_buffer_end_point - oldest_record_offset)) != 0) {
            *length = 0;
            goto out;
        }
        *length = telemetry_buffer_end_point - oldest_record_offset;
    } else {
        *length = 0;
    }

#if TELEMETRY_DEBUG
    log_telemetry_output(telemetry_buffer, 0, telemetry_buffer_current_position);
#endif
    if ((result = copyout((void *)telemetry_buffer, buffer + *length,
        telemetry_buffer_current_position)) != 0) {
        *length = 0;
        goto out;
    }
    *length += (uint32_t)telemetry_buffer_current_position;

out:

    if (mark && (*length > 0)) {
        telemetry_bytes_since_last_mark = 0;
    }

    TELEMETRY_UNLOCK();

    KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END, telemetry_buffer_current_position, *length, telemetry_buffer_end_point, 0, 0);

    return (result);
}

/************************/
/* BOOT PROFILE SUPPORT */
/************************/
/*
 * Boot Profiling
 *
 * The boot-profiling support is a mechanism to sample activity happening on the
 * system during boot. This mechanism sets up a periodic timer and on every timer fire,
 * captures a full backtrace into the boot profiling buffer. This buffer can be pulled
 * out and analyzed from user-space. It is turned on using the following boot-args:
 * "bootprofile_buffer_size" specifies the size of the boot profile buffer
 * "bootprofile_interval_ms" specifies the interval for the profiling timer
 *
 * Process-Specific Boot Profiling
 *
 * The boot-arg "bootprofile_proc_name" can be used to specify a certain
 * process that needs to be profiled during boot. Setting this boot-arg changes
 * the way stackshots are captured. At every timer fire, the code looks at the
 * currently running process and takes a stackshot only if the requested process
 * is on-core (which makes it unsuitable for MP systems).
 */

#define BOOTPROFILE_MAX_BUFFER_SIZE (64*1024*1024) /* see also COPYSIZELIMIT_PANIC */

vm_offset_t bootprofile_buffer = 0;
uint32_t bootprofile_buffer_size = 0;
uint32_t bootprofile_buffer_current_position = 0;
uint32_t bootprofile_interval_ms = 0;
uint64_t bootprofile_interval_abs = 0;
uint64_t bootprofile_next_deadline = 0;
uint32_t bootprofile_all_procs = 0;
char bootprofile_proc_name[17];

lck_grp_t bootprofile_lck_grp;
lck_mtx_t bootprofile_mtx;

static timer_call_data_t bootprofile_timer_call_entry;

#define BOOTPROFILE_LOCK() do { lck_mtx_lock(&bootprofile_mtx); } while(0)
#define BOOTPROFILE_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&bootprofile_mtx)
#define BOOTPROFILE_UNLOCK() do { lck_mtx_unlock(&bootprofile_mtx); } while(0)

static void bootprofile_timer_call(
    timer_call_param_t param0,
    timer_call_param_t param1);

extern int
stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, unsigned *retbytes);

void bootprofile_init(void)
{
    kern_return_t ret;

    lck_grp_init(&bootprofile_lck_grp, "bootprofile group", LCK_GRP_ATTR_NULL);
    lck_mtx_init(&bootprofile_mtx, &bootprofile_lck_grp, LCK_ATTR_NULL);

    if (!PE_parse_boot_argn("bootprofile_buffer_size", &bootprofile_buffer_size, sizeof(bootprofile_buffer_size))) {
        bootprofile_buffer_size = 0;
    }

    if (bootprofile_buffer_size > BOOTPROFILE_MAX_BUFFER_SIZE)
        bootprofile_buffer_size = BOOTPROFILE_MAX_BUFFER_SIZE;

    if (!PE_parse_boot_argn("bootprofile_interval_ms", &bootprofile_interval_ms, sizeof(bootprofile_interval_ms))) {
        bootprofile_interval_ms = 0;
    }

    if (!PE_parse_boot_argn("bootprofile_proc_name", &bootprofile_proc_name, sizeof(bootprofile_proc_name))) {
        bootprofile_all_procs = 1;
        bootprofile_proc_name[0] = '\0';
    }

    clock_interval_to_absolutetime_interval(bootprofile_interval_ms, NSEC_PER_MSEC, &bootprofile_interval_abs);

    /* Both boot args must be set to enable */
    if ((bootprofile_buffer_size == 0) || (bootprofile_interval_abs == 0)) {
        return;
    }

    ret = kmem_alloc(kernel_map, &bootprofile_buffer, bootprofile_buffer_size);
    if (ret != KERN_SUCCESS) {
        kprintf("Boot profile: Allocation failed: %d\n", ret);
        return;
    }

    kprintf("Boot profile: Sampling %s once per %u ms\n", bootprofile_all_procs ? "all procs" : bootprofile_proc_name, bootprofile_interval_ms);

    timer_call_setup(&bootprofile_timer_call_entry,
        bootprofile_timer_call,
        NULL);

    bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
    timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
        NULL,
        bootprofile_next_deadline,
        0,
        TIMER_CALL_SYS_NORMAL,
        FALSE);
}
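
/*
 * Example (illustrative only): boot-args to capture stackshots of launchd
 * every 10 ms into a 4MB buffer during boot:
 *
 *    bootprofile_buffer_size=4194304 bootprofile_interval_ms=10 bootprofile_proc_name=launchd
 *
 * Both bootprofile_buffer_size and bootprofile_interval_ms must be nonzero
 * for profiling to start; omitting bootprofile_proc_name profiles all
 * processes.
 */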

static void bootprofile_timer_call(
    timer_call_param_t param0 __unused,
    timer_call_param_t param1 __unused)
{
    unsigned retbytes = 0;
    int pid_to_profile = -1;

    if (!BOOTPROFILE_TRY_SPIN_LOCK()) {
        goto reprogram;
    }

    /* Check if process-specific boot profiling is turned on */
    if (!bootprofile_all_procs) {
        /*
         * Since boot profiling initializes really early in boot, it is
         * possible that at this point, the task/proc is not initialized.
         * Nothing to do in that case.
         */

        if ((current_task() != NULL) && (current_task()->bsd_info != NULL) &&
            (0 == strncmp(bootprofile_proc_name, proc_name_address(current_task()->bsd_info), 17))) {
            pid_to_profile = proc_selfpid();
        } else {
            /*
             * Process-specific boot profiling requested but the on-core process is
             * something else. Nothing to do here.
             */
            BOOTPROFILE_UNLOCK();
            goto reprogram;
        }
    }

    /* initiate a stackshot with whatever portion of the buffer is left */
    if (bootprofile_buffer_current_position < bootprofile_buffer_size) {
        stack_snapshot_from_kernel(
            pid_to_profile,
            (void *)(bootprofile_buffer + bootprofile_buffer_current_position),
            bootprofile_buffer_size - bootprofile_buffer_current_position,
            STACKSHOT_SAVE_LOADINFO | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS,
            &retbytes
            );

        bootprofile_buffer_current_position += retbytes;
    }

    BOOTPROFILE_UNLOCK();

    /* If we didn't get any data or have run out of buffer space, stop profiling */
    if ((retbytes == 0) || (bootprofile_buffer_current_position == bootprofile_buffer_size)) {
        return;
    }

reprogram:
    /* If the user gathered the buffer, no need to keep profiling */
    if (bootprofile_interval_abs == 0) {
        return;
    }

    clock_deadline_for_periodic_event(bootprofile_interval_abs,
        mach_absolute_time(),
        &bootprofile_next_deadline);
    timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
        NULL,
        bootprofile_next_deadline,
        0,
        TIMER_CALL_SYS_NORMAL,
        FALSE);
}

int bootprofile_gather(user_addr_t buffer, uint32_t *length)
{
    int result = 0;

    BOOTPROFILE_LOCK();

    if (bootprofile_buffer == 0) {
        *length = 0;
        goto out;
    }

    if (*length < bootprofile_buffer_current_position) {
        result = KERN_NO_SPACE;
        goto out;
    }

    if ((result = copyout((void *)bootprofile_buffer, buffer,
        bootprofile_buffer_current_position)) != 0) {
        *length = 0;
        goto out;
    }
    *length = bootprofile_buffer_current_position;

    /* cancel future timers */
    bootprofile_interval_abs = 0;

out:

    BOOTPROFILE_UNLOCK();

    return (result);
}