kmp_stats.h revision 345153
1#ifndef KMP_STATS_H 2#define KMP_STATS_H 3 4/** @file kmp_stats.h 5 * Functions for collecting statistics. 6 */ 7 8//===----------------------------------------------------------------------===// 9// 10// The LLVM Compiler Infrastructure 11// 12// This file is dual licensed under the MIT and the University of Illinois Open 13// Source Licenses. See LICENSE.txt for details. 14// 15//===----------------------------------------------------------------------===// 16 17#include "kmp_config.h" 18#include "kmp_debug.h" 19 20#if KMP_STATS_ENABLED 21/* Statistics accumulator. 22 Accumulates number of samples and computes min, max, mean, standard deviation 23 on the fly. 24 25 Online variance calculation algorithm from 26 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 27 */ 28 29#include "kmp_stats_timing.h" 30#include <limits> 31#include <math.h> 32#include <new> // placement new 33#include <stdint.h> 34#include <string> 35#include <vector> 36 37/* Enable developer statistics here if you want them. They are more detailed 38 than is useful for application characterisation and are intended for the 39 runtime library developer. */ 40#define KMP_DEVELOPER_STATS 0 41 42/* Enable/Disable histogram output */ 43#define KMP_STATS_HIST 0 44 45/*! 46 * @ingroup STATS_GATHERING 47 * \brief flags to describe the statistic (timer or counter) 48 * 49 */ 50enum stats_flags_e { 51 noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic 52 onlyInMaster = 1 << 1, //!< statistic is valid only for master 53 noUnits = 1 << 2, //!< statistic doesn't need units printed next to it 54 notInMaster = 1 << 3, //!< statistic is valid only for non-master threads 55 logEvent = 1 << 4 //!< statistic can be logged on the event timeline when 56 //! KMP_STATS_EVENTS is on (valid only for timers) 57}; 58 59/*! 60 * @ingroup STATS_GATHERING 61 * \brief the states which a thread can be in 62 * 63 */ 64enum stats_state_e { 65 IDLE, 66 SERIAL_REGION, 67 FORK_JOIN_BARRIER, 68 PLAIN_BARRIER, 69 TASKWAIT, 70 TASKYIELD, 71 TASKGROUP, 72 IMPLICIT_TASK, 73 EXPLICIT_TASK 74}; 75 76/*! 77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h 78 * 79 * @param macro a user defined macro that takes three arguments - 80 * macro(COUNTER_NAME, flags, arg) 81 * @param arg a user defined argument to send to the user defined macro 82 * 83 * \details A counter counts the occurrence of some event. Each thread 84 * accumulates its own count, at the end of execution the counts are aggregated 85 * treating each thread as a separate measurement. (Unless onlyInMaster is set, 86 * in which case there's only a single measurement). The min,mean,max are 87 * therefore the values for the threads. Adding the counter here and then 88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you 89 * need to do. All of the tables and printing is generated from this macro. 90 * Format is "macro(name, flags, arg)" 91 * 92 * @ingroup STATS_GATHERING 93 */ 94// clang-format off 95#define KMP_FOREACH_COUNTER(macro, arg) \ 96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ 97 macro(OMP_NESTED_PARALLEL, 0, arg) \ 98 macro(OMP_LOOP_STATIC, 0, arg) \ 99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ 100 macro(OMP_LOOP_DYNAMIC, 0, arg) \ 101 macro(OMP_DISTRIBUTE, 0, arg) \ 102 macro(OMP_BARRIER, 0, arg) \ 103 macro(OMP_CRITICAL, 0, arg) \ 104 macro(OMP_SINGLE, 0, arg) \ 105 macro(OMP_MASTER, 0, arg) \ 106 macro(OMP_TEAMS, 0, arg) \ 107 macro(OMP_set_lock, 0, arg) \ 108 macro(OMP_test_lock, 0, arg) \ 109 macro(REDUCE_wait, 0, arg) \ 110 macro(REDUCE_nowait, 0, arg) \ 111 macro(OMP_TASKYIELD, 0, arg) \ 112 macro(OMP_TASKLOOP, 0, arg) \ 113 macro(TASK_executed, 0, arg) \ 114 macro(TASK_cancelled, 0, arg) \ 115 macro(TASK_stolen, 0, arg) 116// clang-format on 117 118/*! 119 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h 120 * 121 * @param macro a user defined macro that takes three arguments - 122 * macro(TIMER_NAME, flags, arg) 123 * @param arg a user defined argument to send to the user defined macro 124 * 125 * \details A timer collects multiple samples of some count in each thread and 126 * then finally aggregates all of the samples from all of the threads. For most 127 * timers the printing code also provides an aggregation over the thread totals. 128 * These are printed as TOTAL_foo. The count is normally a time (in ticks), 129 * hence the name "timer". (But can be any value, so we use this for "number of 130 * arguments passed to fork" as well). For timers the threads are not 131 * significant, it's the individual observations that count, so the statistics 132 * are at that level. Format is "macro(name, flags, arg)" 133 * 134 * @ingroup STATS_GATHERING2 135 */ 136// clang-format off 137#define KMP_FOREACH_TIMER(macro, arg) \ 138 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ 139 macro (OMP_parallel, stats_flags_e::logEvent, arg) \ 140 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ 141 macro (OMP_loop_static, 0, arg) \ 142 macro (OMP_loop_static_scheduling, 0, arg) \ 143 macro (OMP_loop_dynamic, 0, arg) \ 144 macro (OMP_loop_dynamic_scheduling, 0, arg) \ 145 macro (OMP_critical, 0, arg) \ 146 macro (OMP_critical_wait, 0, arg) \ 147 macro (OMP_single, 0, arg) \ 148 macro (OMP_master, 0, arg) \ 149 macro (OMP_task_immediate, 0, arg) \ 150 macro (OMP_task_taskwait, 0, arg) \ 151 macro (OMP_task_taskyield, 0, arg) \ 152 macro (OMP_task_taskgroup, 0, arg) \ 153 macro (OMP_task_join_bar, 0, arg) \ 154 macro (OMP_task_plain_bar, 0, arg) \ 155 macro (OMP_taskloop_scheduling, 0, arg) \ 156 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ 157 macro (OMP_idle, stats_flags_e::logEvent, arg) \ 158 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ 159 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ 160 macro (OMP_serial, stats_flags_e::logEvent, arg) \ 161 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 162 arg) \ 163 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 164 arg) \ 165 macro (OMP_loop_static_iterations, \ 166 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 167 macro (OMP_loop_dynamic_iterations, \ 168 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 169 KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 170// clang-format on 171 172// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either 173// initializing OpenMP or being created by a master) 174// until the thread is destroyed 175// OMP_parallel -- Time thread spends executing work directly 176// within a #pragma omp parallel 177// OMP_parallel_overhead -- Time thread spends setting up a parallel region 178// OMP_loop_static -- Time thread spends executing loop iterations from 179// a statically scheduled loop 180// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations 181// from a statically scheduled loop 182// OMP_loop_dynamic -- Time thread spends executing loop iterations from 183// a dynamically scheduled loop 184// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations 185// from a dynamically scheduled loop 186// OMP_critical -- Time thread spends executing critical section 187// OMP_critical_wait -- Time thread spends waiting to enter 188// a critcal seciton 189// OMP_single -- Time spent executing a "single" region 190// OMP_master -- Time spent executing a "master" region 191// OMP_task_immediate -- Time spent executing non-deferred tasks 192// OMP_task_taskwait -- Time spent executing tasks inside a taskwait 193// construct 194// OMP_task_taskyield -- Time spent executing tasks inside a taskyield 195// construct 196// OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup 197// construct 198// OMP_task_join_bar -- Time spent executing tasks inside a join barrier 199// OMP_task_plain_bar -- Time spent executing tasks inside a barrier 200// construct 201// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop 202// construct 203// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or 204// inside implicit barrier at end of worksharing 205// construct 206// OMP_idle -- Time worker threads spend waiting for next 207// parallel region 208// OMP_fork_barrier -- Time spent in a the fork barrier surrounding a 209// parallel region 210// OMP_join_barrier -- Time spent in a the join barrier surrounding a 211// parallel region 212// OMP_serial -- Time thread zero spends executing serial code 213// OMP_set_numthreads -- Values passed to omp_set_num_threads 214// OMP_PARALLEL_args -- Number of arguments passed to a parallel region 215// OMP_loop_static_iterations -- Number of iterations thread is assigned for 216// statically scheduled loops 217// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for 218// dynamically scheduled loops 219 220#if (KMP_DEVELOPER_STATS) 221// Timers which are of interest to runtime library developers, not end users. 222// These have to be explicitly enabled in addition to the other stats. 223 224// KMP_fork_barrier -- time in __kmp_fork_barrier 225// KMP_join_barrier -- time in __kmp_join_barrier 226// KMP_barrier -- time in __kmp_barrier 227// KMP_end_split_barrier -- time in __kmp_end_split_barrier 228// KMP_setup_icv_copy -- time in __kmp_setup_icv_copy 229// KMP_icv_copy -- start/stop timer for any ICV copying 230// KMP_linear_gather -- time in __kmp_linear_barrier_gather 231// KMP_linear_release -- time in __kmp_linear_barrier_release 232// KMP_tree_gather -- time in __kmp_tree_barrier_gather 233// KMP_tree_release -- time in __kmp_tree_barrier_release 234// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather 235// KMP_hyper_release -- time in __kmp_hyper_barrier_release 236// clang-format off 237#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ 238 macro(KMP_fork_call, 0, arg) \ 239 macro(KMP_join_call, 0, arg) \ 240 macro(KMP_end_split_barrier, 0, arg) \ 241 macro(KMP_hier_gather, 0, arg) \ 242 macro(KMP_hier_release, 0, arg) \ 243 macro(KMP_hyper_gather, 0, arg) \ 244 macro(KMP_hyper_release, 0, arg) \ 245 macro(KMP_linear_gather, 0, arg) \ 246 macro(KMP_linear_release, 0, arg) \ 247 macro(KMP_tree_gather, 0, arg) \ 248 macro(KMP_tree_release, 0, arg) \ 249 macro(USER_resume, 0, arg) \ 250 macro(USER_suspend, 0, arg) \ 251 macro(KMP_allocate_team, 0, arg) \ 252 macro(KMP_setup_icv_copy, 0, arg) \ 253 macro(USER_icv_copy, 0, arg) \ 254 macro (FOR_static_steal_stolen, \ 255 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 256 macro (FOR_static_steal_chunks, \ 257 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) 258#else 259#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 260#endif 261// clang-format on 262 263/*! 264 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. 265 * 266 * @param macro a user defined macro that takes three arguments - 267 * macro(TIMER_NAME, flags, arg) 268 * @param arg a user defined argument to send to the user defined macro 269 * 270 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE 271 * BAD THINGS WILL HAPPEN! 272 * 273 * \details Explicit timers are ones where we need to allocate a timer itself 274 * (as well as the accumulated timing statistics). We allocate these on a 275 * per-thread basis, and explicitly start and stop them. Block timers just 276 * allocate the timer itself on the stack, and use the destructor to notice 277 * block exit; they don't need to be defined here. The name here should be the 278 * same as that of a timer above. 279 * 280 * @ingroup STATS_GATHERING 281*/ 282#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) 283 284#define ENUMERATE(name, ignore, prefix) prefix##name, 285enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; 286 287enum explicit_timer_e { 288 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST 289}; 290 291enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; 292#undef ENUMERATE 293 294/* 295 * A logarithmic histogram. It accumulates the number of values in each power of 296 * ten bin. So 1<=x<10, 10<=x<100, ... 297 * Mostly useful where we have some big outliers and want to see information 298 * about them. 299 */ 300class logHistogram { 301 enum { 302 numBins = 31, /* Number of powers of 10. If this changes you need to change 303 * the initializer for binMax */ 304 305 /* 306 * If you want to use this to analyse values that may be less than 1, (for 307 * instance times in s), then the logOffset gives you negative powers. 308 * In our case here, we're just looking at times in ticks, or counts, so we 309 * can never see values with magnitude < 1 (other than zero), so we can set 310 * it to 0. As above change the initializer if you change this. 311 */ 312 logOffset = 0 313 }; 314 uint32_t KMP_ALIGN_CACHE zeroCount; 315 struct { 316 uint32_t count; 317 double total; 318 } bins[numBins]; 319 320 static double binMax[numBins]; 321 322#ifdef KMP_DEBUG 323 uint64_t _total; 324 325 void check() const { 326 uint64_t t = zeroCount; 327 for (int i = 0; i < numBins; i++) 328 t += bins[i].count; 329 KMP_DEBUG_ASSERT(t == _total); 330 } 331#else 332 void check() const {} 333#endif 334 335public: 336 logHistogram() { reset(); } 337 338 logHistogram(logHistogram const &o) { 339 for (int i = 0; i < numBins; i++) 340 bins[i] = o.bins[i]; 341#ifdef KMP_DEBUG 342 _total = o._total; 343#endif 344 } 345 346 void reset() { 347 zeroCount = 0; 348 for (int i = 0; i < numBins; i++) { 349 bins[i].count = 0; 350 bins[i].total = 0; 351 } 352 353#ifdef KMP_DEBUG 354 _total = 0; 355#endif 356 } 357 uint32_t count(int b) const { return bins[b + logOffset].count; } 358 double total(int b) const { return bins[b + logOffset].total; } 359 static uint32_t findBin(double sample); 360 361 logHistogram &operator+=(logHistogram const &o) { 362 zeroCount += o.zeroCount; 363 for (int i = 0; i < numBins; i++) { 364 bins[i].count += o.bins[i].count; 365 bins[i].total += o.bins[i].total; 366 } 367#ifdef KMP_DEBUG 368 _total += o._total; 369 check(); 370#endif 371 372 return *this; 373 } 374 375 void addSample(double sample); 376 int minBin() const; 377 int maxBin() const; 378 379 std::string format(char) const; 380}; 381 382class statistic { 383 double KMP_ALIGN_CACHE minVal; 384 double maxVal; 385 double meanVal; 386 double m2; 387 uint64_t sampleCount; 388 double offset; 389 bool collectingHist; 390 logHistogram hist; 391 392public: 393 statistic(bool doHist = bool(KMP_STATS_HIST)) { 394 reset(); 395 collectingHist = doHist; 396 } 397 statistic(statistic const &o) 398 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), 399 sampleCount(o.sampleCount), offset(o.offset), 400 collectingHist(o.collectingHist), hist(o.hist) {} 401 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) 402 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), 403 sampleCount(sc), offset(0.0), collectingHist(false) {} 404 bool haveHist() const { return collectingHist; } 405 double getMin() const { return minVal; } 406 double getMean() const { return meanVal; } 407 double getMax() const { return maxVal; } 408 uint64_t getCount() const { return sampleCount; } 409 double getSD() const { return sqrt(m2 / sampleCount); } 410 double getTotal() const { return sampleCount * meanVal; } 411 logHistogram const *getHist() const { return &hist; } 412 void setOffset(double d) { offset = d; } 413 414 void reset() { 415 minVal = std::numeric_limits<double>::max(); 416 maxVal = -minVal; 417 meanVal = 0.0; 418 m2 = 0.0; 419 sampleCount = 0; 420 offset = 0.0; 421 hist.reset(); 422 } 423 void addSample(double sample); 424 void scale(double factor); 425 void scaleDown(double f) { scale(1. / f); } 426 void forceCount(uint64_t count) { sampleCount = count; } 427 statistic &operator+=(statistic const &other); 428 429 std::string format(char unit, bool total = false) const; 430 std::string formatHist(char unit) const { return hist.format(unit); } 431}; 432 433struct statInfo { 434 const char *name; 435 uint32_t flags; 436}; 437 438class timeStat : public statistic { 439 static statInfo timerInfo[]; 440 441public: 442 timeStat() : statistic() {} 443 static const char *name(timer_e e) { return timerInfo[e].name; } 444 static bool noTotal(timer_e e) { 445 return timerInfo[e].flags & stats_flags_e::noTotal; 446 } 447 static bool masterOnly(timer_e e) { 448 return timerInfo[e].flags & stats_flags_e::onlyInMaster; 449 } 450 static bool workerOnly(timer_e e) { 451 return timerInfo[e].flags & stats_flags_e::notInMaster; 452 } 453 static bool noUnits(timer_e e) { 454 return timerInfo[e].flags & stats_flags_e::noUnits; 455 } 456 static bool logEvent(timer_e e) { 457 return timerInfo[e].flags & stats_flags_e::logEvent; 458 } 459 static void clearEventFlags() { 460 for (int i = 0; i < TIMER_LAST; i++) { 461 timerInfo[i].flags &= (~(stats_flags_e::logEvent)); 462 } 463 } 464}; 465 466// Where we need explicitly to start and end the timer, this version can be used 467// Since these timers normally aren't nicely scoped, so don't have a good place 468// to live on the stack of the thread, they're more work to use. 469class explicitTimer { 470 timeStat *stat; 471 timer_e timerEnumValue; 472 tsc_tick_count startTime; 473 tsc_tick_count pauseStartTime; 474 tsc_tick_count::tsc_interval_t totalPauseTime; 475 476public: 477 explicitTimer(timeStat *s, timer_e te) 478 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), 479 totalPauseTime() {} 480 481 // void setStat(timeStat *s) { stat = s; } 482 void start(tsc_tick_count tick); 483 void pause(tsc_tick_count tick) { pauseStartTime = tick; } 484 void resume(tsc_tick_count tick) { 485 totalPauseTime += (tick - pauseStartTime); 486 } 487 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); 488 void reset() { 489 startTime = 0; 490 pauseStartTime = 0; 491 totalPauseTime = 0; 492 } 493 timer_e get_type() const { return timerEnumValue; } 494}; 495 496// Where you need to partition a threads clock ticks into separate states 497// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and 498// DOING_NOTHING would render these conditions: 499// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive 500// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice 501// versa 502class partitionedTimers { 503private: 504 std::vector<explicitTimer> timer_stack; 505 506public: 507 partitionedTimers(); 508 void init(explicitTimer timer); 509 void exchange(explicitTimer timer); 510 void push(explicitTimer timer); 511 void pop(); 512 void windup(); 513}; 514 515// Special wrapper around the partioned timers to aid timing code blocks 516// It avoids the need to have an explicit end, leaving the scope suffices. 517class blockPartitionedTimer { 518 partitionedTimers *part_timers; 519 520public: 521 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) 522 : part_timers(pt) { 523 part_timers->push(timer); 524 } 525 ~blockPartitionedTimer() { part_timers->pop(); } 526}; 527 528// Special wrapper around the thread state to aid in keeping state in code 529// blocks It avoids the need to have an explicit end, leaving the scope 530// suffices. 531class blockThreadState { 532 stats_state_e *state_pointer; 533 stats_state_e old_state; 534 535public: 536 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) 537 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { 538 *state_pointer = new_state; 539 } 540 ~blockThreadState() { *state_pointer = old_state; } 541}; 542 543// If all you want is a count, then you can use this... 544// The individual per-thread counts will be aggregated into a statistic at 545// program exit. 546class counter { 547 uint64_t value; 548 static const statInfo counterInfo[]; 549 550public: 551 counter() : value(0) {} 552 void increment() { value++; } 553 uint64_t getValue() const { return value; } 554 void reset() { value = 0; } 555 static const char *name(counter_e e) { return counterInfo[e].name; } 556 static bool masterOnly(counter_e e) { 557 return counterInfo[e].flags & stats_flags_e::onlyInMaster; 558 } 559}; 560 561/* **************************************************************** 562 Class to implement an event 563 564 There are four components to an event: start time, stop time 565 nest_level, and timer_name. 566 The start and stop time should be obvious (recorded in clock ticks). 567 The nest_level relates to the bar width in the timeline graph. 568 The timer_name is used to determine which timer event triggered this event. 569 570 the interface to this class is through four read-only operations: 571 1) getStart() -- returns the start time as 64 bit integer 572 2) getStop() -- returns the stop time as 64 bit integer 573 3) getNestLevel() -- returns the nest level of the event 574 4) getTimerName() -- returns the timer name that triggered event 575 576 *MORE ON NEST_LEVEL* 577 The nest level is used in the bar graph that represents the timeline. 578 Its main purpose is for showing how events are nested inside eachother. 579 For example, say events, A, B, and C are recorded. If the timeline 580 looks like this: 581 582Begin -------------------------------------------------------------> Time 583 | | | | | | 584 A B C C B A 585 start start start end end end 586 587 Then A, B, C will have a nest level of 1, 2, 3 respectively. 588 These values are then used to calculate the barwidth so you can 589 see that inside A, B has occurred, and inside B, C has occurred. 590 Currently, this is shown with A's bar width being larger than B's 591 bar width, and B's bar width being larger than C's bar width. 592 593**************************************************************** */ 594class kmp_stats_event { 595 uint64_t start; 596 uint64_t stop; 597 int nest_level; 598 timer_e timer_name; 599 600public: 601 kmp_stats_event() 602 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} 603 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) 604 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} 605 inline uint64_t getStart() const { return start; } 606 inline uint64_t getStop() const { return stop; } 607 inline int getNestLevel() const { return nest_level; } 608 inline timer_e getTimerName() const { return timer_name; } 609}; 610 611/* **************************************************************** 612 Class to implement a dynamically expandable array of events 613 614 --------------------------------------------------------- 615 | event 1 | event 2 | event 3 | event 4 | ... | event N | 616 --------------------------------------------------------- 617 618 An event is pushed onto the back of this array at every 619 explicitTimer->stop() call. The event records the thread #, 620 start time, stop time, and nest level related to the bar width. 621 622 The event vector starts at size INIT_SIZE and grows (doubles in size) 623 if needed. An implication of this behavior is that log(N) 624 reallocations are needed (where N is number of events). If you want 625 to avoid reallocations, then set INIT_SIZE to a large value. 626 627 the interface to this class is through six operations: 628 1) reset() -- sets the internal_size back to 0 but does not deallocate any 629 memory 630 2) size() -- returns the number of valid elements in the vector 631 3) push_back(start, stop, nest, timer_name) -- pushes an event onto 632 the back of the array 633 4) deallocate() -- frees all memory associated with the vector 634 5) sort() -- sorts the vector by start time 635 6) operator[index] or at(index) -- returns event reference at that index 636**************************************************************** */ 637class kmp_stats_event_vector { 638 kmp_stats_event *events; 639 int internal_size; 640 int allocated_size; 641 static const int INIT_SIZE = 1024; 642 643public: 644 kmp_stats_event_vector() { 645 events = 646 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); 647 internal_size = 0; 648 allocated_size = INIT_SIZE; 649 } 650 ~kmp_stats_event_vector() {} 651 inline void reset() { internal_size = 0; } 652 inline int size() const { return internal_size; } 653 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, 654 timer_e name) { 655 int i; 656 if (internal_size == allocated_size) { 657 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( 658 sizeof(kmp_stats_event) * allocated_size * 2); 659 for (i = 0; i < internal_size; i++) 660 tmp[i] = events[i]; 661 __kmp_free(events); 662 events = tmp; 663 allocated_size *= 2; 664 } 665 events[internal_size] = 666 kmp_stats_event(start_time, stop_time, nest_level, name); 667 internal_size++; 668 return; 669 } 670 void deallocate(); 671 void sort(); 672 const kmp_stats_event &operator[](int index) const { return events[index]; } 673 kmp_stats_event &operator[](int index) { return events[index]; } 674 const kmp_stats_event &at(int index) const { return events[index]; } 675 kmp_stats_event &at(int index) { return events[index]; } 676}; 677 678/* **************************************************************** 679 Class to implement a doubly-linked, circular, statistics list 680 681 |---| ---> |---| ---> |---| ---> |---| ---> ... next 682 | | | | | | | | 683 |---| <--- |---| <--- |---| <--- |---| <--- ... prev 684 Sentinel first second third 685 Node node node node 686 687 The Sentinel Node is the user handle on the list. 688 The first node corresponds to thread 0's statistics. 689 The second node corresponds to thread 1's statistics and so on... 690 691 Each node has a _timers, _counters, and _explicitTimers array to hold that 692 thread's statistics. The _explicitTimers point to the correct _timer and 693 update its statistics at every stop() call. The explicitTimers' pointers are 694 set up in the constructor. Each node also has an event vector to hold that 695 thread's timing events. The event vector expands as necessary and records 696 the start-stop times for each timer. 697 698 The nestLevel variable is for plotting events and is related 699 to the bar width in the timeline graph. 700 701 Every thread will have a thread local pointer to its node in 702 the list. The sentinel node is used by the master thread to 703 store "dummy" statistics before __kmp_create_worker() is called. 704**************************************************************** */ 705class kmp_stats_list { 706 int gtid; 707 timeStat _timers[TIMER_LAST + 1]; 708 counter _counters[COUNTER_LAST + 1]; 709 explicitTimer thread_life_timer; 710 partitionedTimers _partitionedTimers; 711 int _nestLevel; // one per thread 712 kmp_stats_event_vector _event_vector; 713 kmp_stats_list *next; 714 kmp_stats_list *prev; 715 stats_state_e state; 716 int thread_is_idle_flag; 717 718public: 719 kmp_stats_list() 720 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], 721 TIMER_OMP_worker_thread_life), 722 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), 723 thread_is_idle_flag(0) {} 724 ~kmp_stats_list() {} 725 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } 726 inline counter *getCounter(counter_e idx) { return &_counters[idx]; } 727 inline partitionedTimers *getPartitionedTimers() { 728 return &_partitionedTimers; 729 } 730 inline timeStat *getTimers() { return _timers; } 731 inline counter *getCounters() { return _counters; } 732 inline kmp_stats_event_vector &getEventVector() { return _event_vector; } 733 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } 734 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } 735 inline void resetEventVector() { _event_vector.reset(); } 736 inline void incrementNestValue() { _nestLevel++; } 737 inline int getNestValue() { return _nestLevel; } 738 inline void decrementNestValue() { _nestLevel--; } 739 inline int getGtid() const { return gtid; } 740 inline void setGtid(int newgtid) { gtid = newgtid; } 741 inline void setState(stats_state_e newstate) { state = newstate; } 742 inline stats_state_e getState() const { return state; } 743 inline stats_state_e *getStatePointer() { return &state; } 744 inline bool isIdle() { return thread_is_idle_flag == 1; } 745 inline void setIdleFlag() { thread_is_idle_flag = 1; } 746 inline void resetIdleFlag() { thread_is_idle_flag = 0; } 747 kmp_stats_list *push_back(int gtid); // returns newly created list node 748 inline void push_event(uint64_t start_time, uint64_t stop_time, 749 int nest_level, timer_e name) { 750 _event_vector.push_back(start_time, stop_time, nest_level, name); 751 } 752 void deallocate(); 753 class iterator; 754 kmp_stats_list::iterator begin(); 755 kmp_stats_list::iterator end(); 756 int size(); 757 class iterator { 758 kmp_stats_list *ptr; 759 friend kmp_stats_list::iterator kmp_stats_list::begin(); 760 friend kmp_stats_list::iterator kmp_stats_list::end(); 761 762 public: 763 iterator(); 764 ~iterator(); 765 iterator operator++(); 766 iterator operator++(int dummy); 767 iterator operator--(); 768 iterator operator--(int dummy); 769 bool operator!=(const iterator &rhs); 770 bool operator==(const iterator &rhs); 771 kmp_stats_list *operator*() const; // dereference operator 772 }; 773}; 774 775/* **************************************************************** 776 Class to encapsulate all output functions and the environment variables 777 778 This module holds filenames for various outputs (normal stats, events, plot 779 file), as well as coloring information for the plot file. 780 781 The filenames and flags variables are read from environment variables. 782 These are read once by the constructor of the global variable 783 __kmp_stats_output which calls init(). 784 785 During this init() call, event flags for the timeStat::timerInfo[] global 786 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). 787 788 The only interface function that is public is outputStats(heading). This 789 function should print out everything it needs to, either to files or stderr, 790 depending on the environment variables described below 791 792 ENVIRONMENT VARIABLES: 793 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this 794 file, otherwise, print to stderr 795 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to 796 either KMP_STATS_FILE or stderr 797 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, 798 otherwise, the plot file is sent to "events.plt" 799 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log 800 events 801 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, 802 otherwise, output is sent to "events.dat" 803**************************************************************** */ 804class kmp_stats_output_module { 805 806public: 807 struct rgb_color { 808 float r; 809 float g; 810 float b; 811 }; 812 813private: 814 std::string outputFileName; 815 static const char *eventsFileName; 816 static const char *plotFileName; 817 static int printPerThreadFlag; 818 static int printPerThreadEventsFlag; 819 static const rgb_color globalColorArray[]; 820 static rgb_color timerColorInfo[]; 821 822 void init(); 823 static void setupEventColors(); 824 static void printPloticusFile(); 825 static void printHeaderInfo(FILE *statsOut); 826 static void printTimerStats(FILE *statsOut, statistic const *theStats, 827 statistic const *totalStats); 828 static void printCounterStats(FILE *statsOut, statistic const *theStats); 829 static void printCounters(FILE *statsOut, counter const *theCounters); 830 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, 831 int gtid); 832 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } 833 static void windupExplicitTimers(); 834 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } 835 836public: 837 kmp_stats_output_module() { init(); } 838 void outputStats(const char *heading); 839}; 840 841#ifdef __cplusplus 842extern "C" { 843#endif 844void __kmp_stats_init(); 845void __kmp_stats_fini(); 846void __kmp_reset_stats(); 847void __kmp_output_stats(const char *); 848void __kmp_accumulate_stats_at_exit(void); 849// thread local pointer to stats node within list 850extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; 851// head to stats list. 852extern kmp_stats_list *__kmp_stats_list; 853// lock for __kmp_stats_list 854extern kmp_tas_lock_t __kmp_stats_lock; 855// reference start time 856extern tsc_tick_count __kmp_stats_start_time; 857// interface to output 858extern kmp_stats_output_module __kmp_stats_output; 859 860#ifdef __cplusplus 861} 862#endif 863 864// Simple, standard interfaces that drop out completely if stats aren't enabled 865 866/*! 867 * \brief Adds value to specified timer (name). 868 * 869 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro 870 * @param value double precision sample value to add to statistics for the timer 871 * 872 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to 873 * a timer statistics. 874 * 875 * @ingroup STATS_GATHERING 876*/ 877#define KMP_COUNT_VALUE(name, value) \ 878 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value) 879 880/*! 881 * \brief Increments specified counter (name). 882 * 883 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro 884 * 885 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics 886 * counter for the executing thread. 887 * 888 * @ingroup STATS_GATHERING 889*/ 890#define KMP_COUNT_BLOCK(name) \ 891 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() 892 893/*! 894 * \brief Outputs the current thread statistics and reset them. 895 * 896 * @param heading_string heading put above the final stats output 897 * 898 * \details Explicitly stops all timers and outputs all stats. Environment 899 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a 900 * filename instead of stderr. Environment variable, 901 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific 902 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be 903 * defined with any value, which will print out thread specific stats, or it can 904 * be undefined (not specified in the environment) and thread specific stats 905 * won't be printed. It should be noted that all statistics are reset when this 906 * macro is called. 907 * 908 * @ingroup STATS_GATHERING 909*/ 910#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) 911 912/*! 913 * \brief Initializes the paritioned timers to begin with name. 914 * 915 * @param name timer which you want this thread to begin with 916 * 917 * @ingroup STATS_GATHERING 918*/ 919#define KMP_INIT_PARTITIONED_TIMERS(name) \ 920 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ 921 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 922 923#define KMP_TIME_PARTITIONED_BLOCK(name) \ 924 blockPartitionedTimer __PBLOCKTIME__( \ 925 __kmp_stats_thread_ptr->getPartitionedTimers(), \ 926 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ 927 TIMER_##name)) 928 929#define KMP_PUSH_PARTITIONED_TIMER(name) \ 930 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ 931 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 932 933#define KMP_POP_PARTITIONED_TIMER() \ 934 __kmp_stats_thread_ptr->getPartitionedTimers()->pop() 935 936#define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ 937 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ 938 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 939 940#define KMP_SET_THREAD_STATE(state_name) \ 941 __kmp_stats_thread_ptr->setState(state_name) 942 943#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState() 944 945#define KMP_SET_THREAD_STATE_BLOCK(state_name) \ 946 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \ 947 state_name) 948 949/*! 950 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) 951 * 952 * \details Reset all stats for all threads. 953 * 954 * @ingroup STATS_GATHERING 955*/ 956#define KMP_RESET_STATS() __kmp_reset_stats() 957 958#if (KMP_DEVELOPER_STATS) 959#define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n) 960#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v) 961#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n) 962#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n) 963#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n) 964#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n) 965#else 966// Null definitions 967#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0) 968#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 969#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 970#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0) 971#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0) 972#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 973#endif 974 975#else // KMP_STATS_ENABLED 976 977// Null definitions 978#define KMP_TIME_BLOCK(n) ((void)0) 979#define KMP_COUNT_VALUE(n, v) ((void)0) 980#define KMP_COUNT_BLOCK(n) ((void)0) 981#define KMP_START_EXPLICIT_TIMER(n) ((void)0) 982#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0) 983 984#define KMP_OUTPUT_STATS(heading_string) ((void)0) 985#define KMP_RESET_STATS() ((void)0) 986 987#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0) 988#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 989#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 990#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0) 991#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0) 992#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) 993#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) 994#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 995#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) 996#define KMP_POP_PARTITIONED_TIMER() ((void)0) 997#define KMP_SET_THREAD_STATE(state_name) ((void)0) 998#define KMP_GET_THREAD_STATE() ((void)0) 999#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) 1000#endif // KMP_STATS_ENABLED 1001 1002#endif // KMP_STATS_H 1003