1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26/* #pragma ident "@(#)profile.c 1.7 07/01/10 SMI" */ 27 28#ifdef KERNEL 29#ifndef _KERNEL 30#define _KERNEL /* Solaris vs. Darwin */ 31#endif 32#endif 33 34#include <kern/cpu_data.h> 35#include <kern/thread.h> 36#include <kern/assert.h> 37#include <mach/thread_status.h> 38 39#include <sys/param.h> 40#include <sys/systm.h> 41#include <sys/errno.h> 42#include <sys/stat.h> 43#include <sys/ioctl.h> 44#include <sys/conf.h> 45#include <sys/fcntl.h> 46#include <miscfs/devfs/devfs.h> 47 48#include <sys/dtrace.h> 49#include <sys/dtrace_impl.h> 50 51#include <sys/dtrace_glue.h> 52 53#include <machine/pal_routines.h> 54 55#if defined(__x86_64__) 56extern x86_saved_state_t *find_kern_regs(thread_t); 57#else 58#error Unknown architecture 59#endif 60 61#undef ASSERT 62#define ASSERT(x) do {} while(0) 63 64extern void profile_init(void); 65 66static dev_info_t *profile_devi; 67static dtrace_provider_id_t profile_id; 68 69/* 70 * Regardless of platform, the stack frames look like this in the case of the 71 * profile provider: 72 * 73 * profile_fire 74 * cyclic_expire 75 * cyclic_fire 76 * [ cbe ] 77 * [ interrupt code ] 78 * 79 * On x86, there are five frames from the generic interrupt code; further, the 80 * interrupted instruction appears as its own stack frame, giving us a total of 81 * 10. 82 * 83 * On SPARC, the picture is further complicated because the compiler 84 * optimizes away tail-calls -- so the following frames are optimized away: 85 * 86 * profile_fire 87 * cyclic_expire 88 * 89 * This gives three frames. However, on DEBUG kernels, the cyclic_expire 90 * frame cannot be tail-call eliminated, yielding four frames in this case. 91 * 92 * All of the above constraints lead to the mess below. Yes, the profile 93 * provider should ideally figure this out on-the-fly by hitting one of its own 94 * probes and then walking its own stack trace. This is complicated, however, 95 * and the static definition doesn't seem to be overly brittle. Still, we 96 * allow for a manual override in case we get it completely wrong. 97 */ 98 99#if defined(__x86_64__) 100#define PROF_ARTIFICIAL_FRAMES 9 101#else 102#error Unknown architecture 103#endif 104 105#define PROF_NAMELEN 15 106 107#define PROF_PROFILE 0 108#define PROF_TICK 1 109#define PROF_PREFIX_PROFILE "profile-" 110#define PROF_PREFIX_TICK "tick-" 111 112typedef struct profile_probe { 113 char prof_name[PROF_NAMELEN]; 114 dtrace_id_t prof_id; 115 int prof_kind; 116 hrtime_t prof_interval; 117 cyclic_id_t prof_cyclic; 118} profile_probe_t; 119 120typedef struct profile_probe_percpu { 121 hrtime_t profc_expected; 122 hrtime_t profc_interval; 123 profile_probe_t *profc_probe; 124} profile_probe_percpu_t; 125 126hrtime_t profile_interval_min = NANOSEC / 5000; /* 5000 hz */ 127int profile_aframes = 0; /* override */ 128 129static int profile_rates[] = { 130 97, 199, 499, 997, 1999, 131 4001, 4999, 0, 0, 0, 132 0, 0, 0, 0, 0, 133 0, 0, 0, 0, 0 134}; 135 136static int profile_ticks[] = { 137 1, 10, 100, 500, 1000, 138 5000, 0, 0, 0, 0, 139 0, 0, 0, 0, 0 140}; 141 142/* 143 * profile_max defines the upper bound on the number of profile probes that 144 * can exist (this is to prevent malicious or clumsy users from exhausing 145 * system resources by creating a slew of profile probes). At mod load time, 146 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's 147 * present in the profile.conf file. 148 */ 149#define PROFILE_MAX_DEFAULT 1000 /* default max. number of probes */ 150static uint32_t profile_max; /* maximum number of profile probes */ 151static uint32_t profile_total; /* current number of profile probes */ 152 153static void 154profile_fire(void *arg) 155{ 156 profile_probe_percpu_t *pcpu = arg; 157 profile_probe_t *prof = pcpu->profc_probe; 158 hrtime_t late; 159 160 late = dtrace_gethrtime() - pcpu->profc_expected; 161 pcpu->profc_expected += pcpu->profc_interval; 162 163#if defined(__x86_64__) 164 x86_saved_state_t *kern_regs = find_kern_regs(current_thread()); 165 166 if (NULL != kern_regs) { 167 /* Kernel was interrupted. */ 168 dtrace_probe(prof->prof_id, saved_state64(kern_regs)->isf.rip, 0x0, late, 0, 0); 169 170 } else { 171 pal_register_cache_state(current_thread(), VALID); 172 /* Possibly a user interrupt */ 173 x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); 174 175 if (NULL == tagged_regs) { 176 /* Too bad, so sad, no useful interrupt state. */ 177 dtrace_probe(prof->prof_id, 0xcafebabe, 178 0x0, late, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */ 179 } else if (is_saved_state64(tagged_regs)) { 180 x86_saved_state64_t *regs = saved_state64(tagged_regs); 181 182 dtrace_probe(prof->prof_id, 0x0, regs->isf.rip, late, 0, 0); 183 } else { 184 x86_saved_state32_t *regs = saved_state32(tagged_regs); 185 186 dtrace_probe(prof->prof_id, 0x0, regs->eip, late, 0, 0); 187 } 188 } 189#else 190#error Unknown architecture 191#endif 192} 193 194static void 195profile_tick(void *arg) 196{ 197 profile_probe_t *prof = arg; 198 199#if defined(__x86_64__) 200 x86_saved_state_t *kern_regs = find_kern_regs(current_thread()); 201 202 if (NULL != kern_regs) { 203 /* Kernel was interrupted. */ 204 dtrace_probe(prof->prof_id, saved_state64(kern_regs)->isf.rip, 0x0, 0, 0, 0); 205 } else { 206 pal_register_cache_state(current_thread(), VALID); 207 /* Possibly a user interrupt */ 208 x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); 209 210 if (NULL == tagged_regs) { 211 /* Too bad, so sad, no useful interrupt state. */ 212 dtrace_probe(prof->prof_id, 0xcafebabe, 213 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */ 214 } else if (is_saved_state64(tagged_regs)) { 215 x86_saved_state64_t *regs = saved_state64(tagged_regs); 216 217 dtrace_probe(prof->prof_id, 0x0, regs->isf.rip, 0, 0, 0); 218 } else { 219 x86_saved_state32_t *regs = saved_state32(tagged_regs); 220 221 dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0); 222 } 223 } 224#else 225#error Unknown architecture 226#endif 227} 228 229static void 230profile_create(hrtime_t interval, const char *name, int kind) 231{ 232 profile_probe_t *prof; 233 234 if (interval < profile_interval_min) 235 return; 236 237 if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0) 238 return; 239 240 atomic_add_32(&profile_total, 1); 241 if (profile_total > profile_max) { 242 atomic_add_32(&profile_total, -1); 243 return; 244 } 245 246 if (PROF_TICK == kind) 247 prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP); 248 else 249 prof = kmem_zalloc(sizeof (profile_probe_t) + NCPU*sizeof(profile_probe_percpu_t), KM_SLEEP); 250 251 (void) strlcpy(prof->prof_name, name, sizeof(prof->prof_name)); 252 prof->prof_interval = interval; 253 prof->prof_cyclic = CYCLIC_NONE; 254 prof->prof_kind = kind; 255 prof->prof_id = dtrace_probe_create(profile_id, 256 NULL, NULL, name, 257 profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof); 258} 259 260/*ARGSUSED*/ 261static void 262profile_provide(void *arg, const dtrace_probedesc_t *desc) 263{ 264#pragma unused(arg) /* __APPLE__ */ 265 int i, j, rate, kind; 266 hrtime_t val = 0, mult = 1, len; 267 const char *name, *suffix = NULL; 268 269 const struct { 270 const char *prefix; 271 int kind; 272 } types[] = { 273 { PROF_PREFIX_PROFILE, PROF_PROFILE }, 274 { PROF_PREFIX_TICK, PROF_TICK }, 275 { NULL, 0 } 276 }; 277 278 const struct { 279 const char *name; 280 hrtime_t mult; 281 } suffixes[] = { 282 { "ns", NANOSEC / NANOSEC }, 283 { "nsec", NANOSEC / NANOSEC }, 284 { "us", NANOSEC / MICROSEC }, 285 { "usec", NANOSEC / MICROSEC }, 286 { "ms", NANOSEC / MILLISEC }, 287 { "msec", NANOSEC / MILLISEC }, 288 { "s", NANOSEC / SEC }, 289 { "sec", NANOSEC / SEC }, 290 { "m", NANOSEC * (hrtime_t)60 }, 291 { "min", NANOSEC * (hrtime_t)60 }, 292 { "h", NANOSEC * (hrtime_t)(60 * 60) }, 293 { "hour", NANOSEC * (hrtime_t)(60 * 60) }, 294 { "d", NANOSEC * (hrtime_t)(24 * 60 * 60) }, 295 { "day", NANOSEC * (hrtime_t)(24 * 60 * 60) }, 296 { "hz", 0 }, 297 { NULL, 0 } 298 }; 299 300 if (desc == NULL) { 301 char n[PROF_NAMELEN]; 302 303 /* 304 * If no description was provided, provide all of our probes. 305 */ 306 for (i = 0; i < (int)(sizeof (profile_rates) / sizeof (int)); i++) { 307 if ((rate = profile_rates[i]) == 0) 308 continue; 309 310 (void) snprintf(n, PROF_NAMELEN, "%s%d", 311 PROF_PREFIX_PROFILE, rate); 312 profile_create(NANOSEC / rate, n, PROF_PROFILE); 313 } 314 315 for (i = 0; i < (int)(sizeof (profile_ticks) / sizeof (int)); i++) { 316 if ((rate = profile_ticks[i]) == 0) 317 continue; 318 319 (void) snprintf(n, PROF_NAMELEN, "%s%d", 320 PROF_PREFIX_TICK, rate); 321 profile_create(NANOSEC / rate, n, PROF_TICK); 322 } 323 324 return; 325 } 326 327 name = desc->dtpd_name; 328 329 for (i = 0; types[i].prefix != NULL; i++) { 330 len = strlen(types[i].prefix); 331 332 if (strncmp(name, types[i].prefix, len) != 0) 333 continue; 334 break; 335 } 336 337 if (types[i].prefix == NULL) 338 return; 339 340 kind = types[i].kind; 341 j = strlen(name) - len; 342 343 /* 344 * We need to start before any time suffix. 345 */ 346 for (j = strlen(name); j >= len; j--) { 347 if (name[j] >= '0' && name[j] <= '9') 348 break; 349 suffix = &name[j]; 350 } 351 352 ASSERT(suffix != NULL); 353 354 /* 355 * Now determine the numerical value present in the probe name. 356 */ 357 for (; j >= len; j--) { 358 if (name[j] < '0' || name[j] > '9') 359 return; 360 361 val += (name[j] - '0') * mult; 362 mult *= (hrtime_t)10; 363 } 364 365 if (val == 0) 366 return; 367 368 /* 369 * Look-up the suffix to determine the multiplier. 370 */ 371 for (i = 0, mult = 0; suffixes[i].name != NULL; i++) { 372 /* APPLE NOTE: Darwin employs size bounded string operations */ 373 if (strncasecmp(suffixes[i].name, suffix, strlen(suffixes[i].name) + 1) == 0) { 374 mult = suffixes[i].mult; 375 break; 376 } 377 } 378 379 if (suffixes[i].name == NULL && *suffix != '\0') 380 return; 381 382 if (mult == 0) { 383 /* 384 * The default is frequency-per-second. 385 */ 386 val = NANOSEC / val; 387 } else { 388 val *= mult; 389 } 390 391 profile_create(val, name, kind); 392} 393 394/*ARGSUSED*/ 395static void 396profile_destroy(void *arg, dtrace_id_t id, void *parg) 397{ 398#pragma unused(arg,id) /* __APPLE__ */ 399 profile_probe_t *prof = parg; 400 401 ASSERT(prof->prof_cyclic == CYCLIC_NONE); 402 403 if (prof->prof_kind == PROF_TICK) 404 kmem_free(prof, sizeof (profile_probe_t)); 405 else 406 kmem_free(prof, sizeof (profile_probe_t) + NCPU*sizeof(profile_probe_percpu_t)); 407 408 ASSERT(profile_total >= 1); 409 atomic_add_32(&profile_total, -1); 410} 411 412/*ARGSUSED*/ 413static void 414profile_online(void *arg, dtrace_cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) 415{ 416#pragma unused(cpu) /* __APPLE__ */ 417 profile_probe_t *prof = arg; 418 profile_probe_percpu_t *pcpu; 419 420 pcpu = ((profile_probe_percpu_t *)(&(prof[1]))) + cpu_number(); 421 pcpu->profc_probe = prof; 422 423 hdlr->cyh_func = profile_fire; 424 hdlr->cyh_arg = pcpu; 425 hdlr->cyh_level = CY_HIGH_LEVEL; 426 427 when->cyt_interval = prof->prof_interval; 428 when->cyt_when = dtrace_gethrtime() + when->cyt_interval; 429 430 pcpu->profc_expected = when->cyt_when; 431 pcpu->profc_interval = when->cyt_interval; 432} 433 434/*ARGSUSED*/ 435static void 436profile_offline(void *arg, dtrace_cpu_t *cpu, void *oarg) 437{ 438 profile_probe_percpu_t *pcpu = oarg; 439 440 ASSERT(pcpu->profc_probe == arg); 441#pragma unused(pcpu,arg,cpu) /* __APPLE__ */ 442} 443 444/*ARGSUSED*/ 445static int 446profile_enable(void *arg, dtrace_id_t id, void *parg) 447{ 448#pragma unused(arg,id) /* __APPLE__ */ 449 profile_probe_t *prof = parg; 450 cyc_omni_handler_t omni; 451 cyc_handler_t hdlr; 452 cyc_time_t when; 453 454 ASSERT(prof->prof_interval != 0); 455 ASSERT(MUTEX_HELD(&cpu_lock)); 456 457 if (prof->prof_kind == PROF_TICK) { 458 hdlr.cyh_func = profile_tick; 459 hdlr.cyh_arg = prof; 460 hdlr.cyh_level = CY_HIGH_LEVEL; 461 462 when.cyt_interval = prof->prof_interval; 463#if !defined(__APPLE__) 464 when.cyt_when = dtrace_gethrtime() + when.cyt_interval; 465#else 466 when.cyt_when = 0; 467#endif /* __APPLE__ */ 468 } else { 469 ASSERT(prof->prof_kind == PROF_PROFILE); 470 omni.cyo_online = profile_online; 471 omni.cyo_offline = profile_offline; 472 omni.cyo_arg = prof; 473 } 474 475 if (prof->prof_kind == PROF_TICK) { 476 prof->prof_cyclic = cyclic_timer_add(&hdlr, &when); 477 } else { 478 prof->prof_cyclic = (cyclic_id_t)cyclic_add_omni(&omni); /* cast puns cyclic_id_list_t with cyclic_id_t */ 479 } 480 481 return(0); 482} 483 484/*ARGSUSED*/ 485static void 486profile_disable(void *arg, dtrace_id_t id, void *parg) 487{ 488 profile_probe_t *prof = parg; 489 490 ASSERT(prof->prof_cyclic != CYCLIC_NONE); 491 ASSERT(MUTEX_HELD(&cpu_lock)); 492 493#pragma unused(arg,id) 494 if (prof->prof_kind == PROF_TICK) { 495 cyclic_timer_remove(prof->prof_cyclic); 496 } else { 497 cyclic_remove_omni((cyclic_id_list_t)prof->prof_cyclic); /* cast puns cyclic_id_list_t with cyclic_id_t */ 498 } 499 prof->prof_cyclic = CYCLIC_NONE; 500} 501 502/* 503 * APPLE NOTE: profile_usermode call not supported. 504 */ 505static int 506profile_usermode(void *arg, dtrace_id_t id, void *parg) 507{ 508#pragma unused(arg,id,parg) 509 return 1; /* XXX_BOGUS */ 510} 511 512static dtrace_pattr_t profile_attr = { 513{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, 514{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN }, 515{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, 516{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, 517{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, 518}; 519 520static dtrace_pops_t profile_pops = { 521 profile_provide, 522 NULL, 523 profile_enable, 524 profile_disable, 525 NULL, 526 NULL, 527 NULL, 528 NULL, 529 profile_usermode, 530 profile_destroy 531}; 532 533static int 534profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 535{ 536 switch (cmd) { 537 case DDI_ATTACH: 538 break; 539 case DDI_RESUME: 540 return (DDI_SUCCESS); 541 default: 542 return (DDI_FAILURE); 543 } 544 545 if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0, 546 DDI_PSEUDO, 0) == DDI_FAILURE || 547 dtrace_register("profile", &profile_attr, 548 DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL, 549 &profile_pops, NULL, &profile_id) != 0) { 550 ddi_remove_minor_node(devi, NULL); 551 return (DDI_FAILURE); 552 } 553 554 profile_max = PROFILE_MAX_DEFAULT; 555 556 ddi_report_dev(devi); 557 profile_devi = devi; 558 return (DDI_SUCCESS); 559} 560 561/* 562 * APPLE NOTE: profile_detach not implemented 563 */ 564#if !defined(__APPLE__) 565static int 566profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) 567{ 568 switch (cmd) { 569 case DDI_DETACH: 570 break; 571 case DDI_SUSPEND: 572 return (DDI_SUCCESS); 573 default: 574 return (DDI_FAILURE); 575 } 576 577 if (dtrace_unregister(profile_id) != 0) 578 return (DDI_FAILURE); 579 580 ddi_remove_minor_node(devi, NULL); 581 return (DDI_SUCCESS); 582} 583#endif /* __APPLE__ */ 584 585d_open_t _profile_open; 586 587int _profile_open(dev_t dev, int flags, int devtype, struct proc *p) 588{ 589#pragma unused(dev,flags,devtype,p) 590 return 0; 591} 592 593#define PROFILE_MAJOR -24 /* let the kernel pick the device number */ 594 595/* 596 * A struct describing which functions will get invoked for certain 597 * actions. 598 */ 599static struct cdevsw profile_cdevsw = 600{ 601 _profile_open, /* open */ 602 eno_opcl, /* close */ 603 eno_rdwrt, /* read */ 604 eno_rdwrt, /* write */ 605 eno_ioctl, /* ioctl */ 606 (stop_fcn_t *)nulldev, /* stop */ 607 (reset_fcn_t *)nulldev, /* reset */ 608 NULL, /* tty's */ 609 eno_select, /* select */ 610 eno_mmap, /* mmap */ 611 eno_strat, /* strategy */ 612 eno_getc, /* getc */ 613 eno_putc, /* putc */ 614 0 /* type */ 615}; 616 617static int gProfileInited = 0; 618 619void profile_init( void ) 620{ 621 if (0 == gProfileInited) 622 { 623 int majdevno = cdevsw_add(PROFILE_MAJOR, &profile_cdevsw); 624 625 if (majdevno < 0) { 626 printf("profile_init: failed to allocate a major number!\n"); 627 gProfileInited = 0; 628 return; 629 } 630 631 profile_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); 632 633 gProfileInited = 1; 634 } else 635 panic("profile_init: called twice!\n"); 636} 637#undef PROFILE_MAJOR 638