kmp_stats.h revision 345153
1#ifndef KMP_STATS_H
2#define KMP_STATS_H
3
4/** @file kmp_stats.h
5 * Functions for collecting statistics.
6 */
7
8//===----------------------------------------------------------------------===//
9//
10//                     The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17#include "kmp_config.h"
18#include "kmp_debug.h"
19
20#if KMP_STATS_ENABLED
21/* Statistics accumulator.
22   Accumulates number of samples and computes min, max, mean, standard deviation
23   on the fly.
24
25   Online variance calculation algorithm from
26   http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
27 */
28
29#include "kmp_stats_timing.h"
30#include <limits>
31#include <math.h>
32#include <new> // placement new
33#include <stdint.h>
34#include <string>
35#include <vector>
36
37/* Enable developer statistics here if you want them. They are more detailed
38   than is useful for application characterisation and are intended for the
39   runtime library developer. */
40#define KMP_DEVELOPER_STATS 0
41
42/* Enable/Disable histogram output */
43#define KMP_STATS_HIST 0
44
45/*!
46 * @ingroup STATS_GATHERING
47 * \brief flags to describe the statistic (timer or counter)
48 *
49 */
50enum stats_flags_e {
51  noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
52  onlyInMaster = 1 << 1, //!< statistic is valid only for master
53  noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
54  notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
55  logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
56  //! KMP_STATS_EVENTS is on (valid only for timers)
57};
58
59/*!
60 * @ingroup STATS_GATHERING
61 * \brief the states which a thread can be in
62 *
63 */
64enum stats_state_e {
65  IDLE,
66  SERIAL_REGION,
67  FORK_JOIN_BARRIER,
68  PLAIN_BARRIER,
69  TASKWAIT,
70  TASKYIELD,
71  TASKGROUP,
72  IMPLICIT_TASK,
73  EXPLICIT_TASK
74};
75
76/*!
77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78 *
79 * @param macro a user defined macro that takes three arguments -
80 * macro(COUNTER_NAME, flags, arg)
81 * @param arg a user defined argument to send to the user defined macro
82 *
83 * \details A counter counts the occurrence of some event. Each thread
84 * accumulates its own count, at the end of execution the counts are aggregated
85 * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86 * in which case there's only a single measurement). The min,mean,max are
87 * therefore the values for the threads. Adding the counter here and then
88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89 * need to do. All of the tables and printing is generated from this macro.
90 * Format is "macro(name, flags, arg)"
91 *
92 * @ingroup STATS_GATHERING
93 */
94// clang-format off
95#define KMP_FOREACH_COUNTER(macro, arg)                                        \
96  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
97  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
98  macro(OMP_LOOP_STATIC, 0, arg)                                               \
99  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
100  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
101  macro(OMP_DISTRIBUTE, 0, arg)                                                \
102  macro(OMP_BARRIER, 0, arg)                                                   \
103  macro(OMP_CRITICAL, 0, arg)                                                  \
104  macro(OMP_SINGLE, 0, arg)                                                    \
105  macro(OMP_MASTER, 0, arg)                                                    \
106  macro(OMP_TEAMS, 0, arg)                                                     \
107  macro(OMP_set_lock, 0, arg)                                                  \
108  macro(OMP_test_lock, 0, arg)                                                 \
109  macro(REDUCE_wait, 0, arg)                                                   \
110  macro(REDUCE_nowait, 0, arg)                                                 \
111  macro(OMP_TASKYIELD, 0, arg)                                                 \
112  macro(OMP_TASKLOOP, 0, arg)                                                  \
113  macro(TASK_executed, 0, arg)                                                 \
114  macro(TASK_cancelled, 0, arg)                                                \
115  macro(TASK_stolen, 0, arg)
116// clang-format on
117
118/*!
119 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
120 *
121 * @param macro a user defined macro that takes three arguments -
122 * macro(TIMER_NAME, flags, arg)
123 * @param arg a user defined argument to send to the user defined macro
124 *
125 * \details A timer collects multiple samples of some count in each thread and
126 * then finally aggregates all of the samples from all of the threads. For most
127 * timers the printing code also provides an aggregation over the thread totals.
128 * These are printed as TOTAL_foo. The count is normally a time (in ticks),
129 * hence the name "timer". (But can be any value, so we use this for "number of
130 * arguments passed to fork" as well). For timers the threads are not
131 * significant, it's the individual observations that count, so the statistics
132 * are at that level. Format is "macro(name, flags, arg)"
133 *
134 * @ingroup STATS_GATHERING2
135 */
136// clang-format off
137#define KMP_FOREACH_TIMER(macro, arg)                                          \
138  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
139  macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
140  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
141  macro (OMP_loop_static, 0, arg)                                              \
142  macro (OMP_loop_static_scheduling, 0, arg)                                   \
143  macro (OMP_loop_dynamic, 0, arg)                                             \
144  macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
145  macro (OMP_critical, 0, arg)                                                 \
146  macro (OMP_critical_wait, 0, arg)                                            \
147  macro (OMP_single, 0, arg)                                                   \
148  macro (OMP_master, 0, arg)                                                   \
149  macro (OMP_task_immediate, 0, arg)                                           \
150  macro (OMP_task_taskwait, 0, arg)                                            \
151  macro (OMP_task_taskyield, 0, arg)                                           \
152  macro (OMP_task_taskgroup, 0, arg)                                           \
153  macro (OMP_task_join_bar, 0, arg)                                            \
154  macro (OMP_task_plain_bar, 0, arg)                                           \
155  macro (OMP_taskloop_scheduling, 0, arg)                                      \
156  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
157  macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
158  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
159  macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
160  macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
161  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
162         arg)                                                                  \
163  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
164         arg)                                                                  \
165  macro (OMP_loop_static_iterations,                                           \
166         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
167  macro (OMP_loop_dynamic_iterations,                                          \
168         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
169  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
170// clang-format on
171
172// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
173//                           initializing OpenMP or being created by a master)
174//                           until the thread is destroyed
175// OMP_parallel           -- Time thread spends executing work directly
176//                           within a #pragma omp parallel
177// OMP_parallel_overhead  -- Time thread spends setting up a parallel region
178// OMP_loop_static        -- Time thread spends executing loop iterations from
179//                           a statically scheduled loop
180// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
181//                               from a statically scheduled loop
182// OMP_loop_dynamic       -- Time thread spends executing loop iterations from
183//                           a dynamically scheduled loop
184// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
185//                                from a dynamically scheduled loop
186// OMP_critical           -- Time thread spends executing critical section
187// OMP_critical_wait      -- Time thread spends waiting to enter
188//                           a critcal seciton
189// OMP_single             -- Time spent executing a "single" region
190// OMP_master             -- Time spent executing a "master" region
191// OMP_task_immediate     -- Time spent executing non-deferred tasks
192// OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
193//                           construct
194// OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
195//                           construct
196// OMP_task_taskgroup     -- Time spent executing tasks inside a taskygroup
197//                           construct
198// OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
199// OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
200//                           construct
201// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
202//                            construct
203// OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
204//                           inside implicit barrier at end of worksharing
205//                           construct
206// OMP_idle               -- Time worker threads spend waiting for next
207//                           parallel region
208// OMP_fork_barrier       -- Time spent in a the fork barrier surrounding a
209//                           parallel region
210// OMP_join_barrier       -- Time spent in a the join barrier surrounding a
211//                           parallel region
212// OMP_serial             -- Time thread zero spends executing serial code
213// OMP_set_numthreads     -- Values passed to omp_set_num_threads
214// OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
215// OMP_loop_static_iterations -- Number of iterations thread is assigned for
216//                               statically scheduled loops
217// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
218//                                dynamically scheduled loops
219
220#if (KMP_DEVELOPER_STATS)
221// Timers which are of interest to runtime library developers, not end users.
222// These have to be explicitly enabled in addition to the other stats.
223
224// KMP_fork_barrier       -- time in __kmp_fork_barrier
225// KMP_join_barrier       -- time in __kmp_join_barrier
226// KMP_barrier            -- time in __kmp_barrier
227// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
228// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
229// KMP_icv_copy           -- start/stop timer for any ICV copying
230// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
231// KMP_linear_release     -- time in __kmp_linear_barrier_release
232// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
233// KMP_tree_release       -- time in __kmp_tree_barrier_release
234// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
235// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
236// clang-format off
237#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
238  macro(KMP_fork_call, 0, arg)                                                 \
239  macro(KMP_join_call, 0, arg)                                                 \
240  macro(KMP_end_split_barrier, 0, arg)                                         \
241  macro(KMP_hier_gather, 0, arg)                                               \
242  macro(KMP_hier_release, 0, arg)                                              \
243  macro(KMP_hyper_gather, 0, arg)                                              \
244  macro(KMP_hyper_release, 0, arg)                                             \
245  macro(KMP_linear_gather, 0, arg)                                             \
246  macro(KMP_linear_release, 0, arg)                                            \
247  macro(KMP_tree_gather, 0, arg)                                               \
248  macro(KMP_tree_release, 0, arg)                                              \
249  macro(USER_resume, 0, arg)                                                   \
250  macro(USER_suspend, 0, arg)                                                  \
251  macro(KMP_allocate_team, 0, arg)                                             \
252  macro(KMP_setup_icv_copy, 0, arg)                                            \
253  macro(USER_icv_copy, 0, arg)                                                 \
254  macro (FOR_static_steal_stolen,                                              \
255         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
256  macro (FOR_static_steal_chunks,                                              \
257         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
258#else
259#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
260#endif
261// clang-format on
262
263/*!
264 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
265 *
266 * @param macro a user defined macro that takes three arguments -
267 * macro(TIMER_NAME, flags, arg)
268 * @param arg a user defined argument to send to the user defined macro
269 *
270 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
271 * BAD THINGS WILL HAPPEN!
272 *
273 * \details Explicit timers are ones where we need to allocate a timer itself
274 * (as well as the accumulated timing statistics). We allocate these on a
275 * per-thread basis, and explicitly start and stop them. Block timers just
276 * allocate the timer itself on the stack, and use the destructor to notice
277 * block exit; they don't need to be defined here. The name here should be the
278 * same as that of a timer above.
279 *
280 * @ingroup STATS_GATHERING
281*/
282#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
283
284#define ENUMERATE(name, ignore, prefix) prefix##name,
285enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
286
287enum explicit_timer_e {
288  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
289};
290
291enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
292#undef ENUMERATE
293
294/*
295 * A logarithmic histogram. It accumulates the number of values in each power of
296 * ten bin.  So 1<=x<10, 10<=x<100, ...
297 * Mostly useful where we have some big outliers and want to see information
298 * about them.
299 */
300class logHistogram {
301  enum {
302    numBins = 31, /* Number of powers of 10. If this changes you need to change
303                   * the initializer for binMax */
304
305    /*
306     * If you want to use this to analyse values that may be less than 1, (for
307     * instance times in s), then the logOffset gives you negative powers.
308     * In our case here, we're just looking at times in ticks, or counts, so we
309     * can never see values with magnitude < 1 (other than zero), so we can set
310     * it to 0.  As above change the initializer if you change this.
311     */
312    logOffset = 0
313  };
314  uint32_t KMP_ALIGN_CACHE zeroCount;
315  struct {
316    uint32_t count;
317    double total;
318  } bins[numBins];
319
320  static double binMax[numBins];
321
322#ifdef KMP_DEBUG
323  uint64_t _total;
324
325  void check() const {
326    uint64_t t = zeroCount;
327    for (int i = 0; i < numBins; i++)
328      t += bins[i].count;
329    KMP_DEBUG_ASSERT(t == _total);
330  }
331#else
332  void check() const {}
333#endif
334
335public:
336  logHistogram() { reset(); }
337
338  logHistogram(logHistogram const &o) {
339    for (int i = 0; i < numBins; i++)
340      bins[i] = o.bins[i];
341#ifdef KMP_DEBUG
342    _total = o._total;
343#endif
344  }
345
346  void reset() {
347    zeroCount = 0;
348    for (int i = 0; i < numBins; i++) {
349      bins[i].count = 0;
350      bins[i].total = 0;
351    }
352
353#ifdef KMP_DEBUG
354    _total = 0;
355#endif
356  }
357  uint32_t count(int b) const { return bins[b + logOffset].count; }
358  double total(int b) const { return bins[b + logOffset].total; }
359  static uint32_t findBin(double sample);
360
361  logHistogram &operator+=(logHistogram const &o) {
362    zeroCount += o.zeroCount;
363    for (int i = 0; i < numBins; i++) {
364      bins[i].count += o.bins[i].count;
365      bins[i].total += o.bins[i].total;
366    }
367#ifdef KMP_DEBUG
368    _total += o._total;
369    check();
370#endif
371
372    return *this;
373  }
374
375  void addSample(double sample);
376  int minBin() const;
377  int maxBin() const;
378
379  std::string format(char) const;
380};
381
382class statistic {
383  double KMP_ALIGN_CACHE minVal;
384  double maxVal;
385  double meanVal;
386  double m2;
387  uint64_t sampleCount;
388  double offset;
389  bool collectingHist;
390  logHistogram hist;
391
392public:
393  statistic(bool doHist = bool(KMP_STATS_HIST)) {
394    reset();
395    collectingHist = doHist;
396  }
397  statistic(statistic const &o)
398      : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
399        sampleCount(o.sampleCount), offset(o.offset),
400        collectingHist(o.collectingHist), hist(o.hist) {}
401  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
402      : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
403        sampleCount(sc), offset(0.0), collectingHist(false) {}
404  bool haveHist() const { return collectingHist; }
405  double getMin() const { return minVal; }
406  double getMean() const { return meanVal; }
407  double getMax() const { return maxVal; }
408  uint64_t getCount() const { return sampleCount; }
409  double getSD() const { return sqrt(m2 / sampleCount); }
410  double getTotal() const { return sampleCount * meanVal; }
411  logHistogram const *getHist() const { return &hist; }
412  void setOffset(double d) { offset = d; }
413
414  void reset() {
415    minVal = std::numeric_limits<double>::max();
416    maxVal = -minVal;
417    meanVal = 0.0;
418    m2 = 0.0;
419    sampleCount = 0;
420    offset = 0.0;
421    hist.reset();
422  }
423  void addSample(double sample);
424  void scale(double factor);
425  void scaleDown(double f) { scale(1. / f); }
426  void forceCount(uint64_t count) { sampleCount = count; }
427  statistic &operator+=(statistic const &other);
428
429  std::string format(char unit, bool total = false) const;
430  std::string formatHist(char unit) const { return hist.format(unit); }
431};
432
433struct statInfo {
434  const char *name;
435  uint32_t flags;
436};
437
438class timeStat : public statistic {
439  static statInfo timerInfo[];
440
441public:
442  timeStat() : statistic() {}
443  static const char *name(timer_e e) { return timerInfo[e].name; }
444  static bool noTotal(timer_e e) {
445    return timerInfo[e].flags & stats_flags_e::noTotal;
446  }
447  static bool masterOnly(timer_e e) {
448    return timerInfo[e].flags & stats_flags_e::onlyInMaster;
449  }
450  static bool workerOnly(timer_e e) {
451    return timerInfo[e].flags & stats_flags_e::notInMaster;
452  }
453  static bool noUnits(timer_e e) {
454    return timerInfo[e].flags & stats_flags_e::noUnits;
455  }
456  static bool logEvent(timer_e e) {
457    return timerInfo[e].flags & stats_flags_e::logEvent;
458  }
459  static void clearEventFlags() {
460    for (int i = 0; i < TIMER_LAST; i++) {
461      timerInfo[i].flags &= (~(stats_flags_e::logEvent));
462    }
463  }
464};
465
466// Where we need explicitly to start and end the timer, this version can be used
467// Since these timers normally aren't nicely scoped, so don't have a good place
468// to live on the stack of the thread, they're more work to use.
469class explicitTimer {
470  timeStat *stat;
471  timer_e timerEnumValue;
472  tsc_tick_count startTime;
473  tsc_tick_count pauseStartTime;
474  tsc_tick_count::tsc_interval_t totalPauseTime;
475
476public:
477  explicitTimer(timeStat *s, timer_e te)
478      : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
479        totalPauseTime() {}
480
481  // void setStat(timeStat *s) { stat = s; }
482  void start(tsc_tick_count tick);
483  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
484  void resume(tsc_tick_count tick) {
485    totalPauseTime += (tick - pauseStartTime);
486  }
487  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
488  void reset() {
489    startTime = 0;
490    pauseStartTime = 0;
491    totalPauseTime = 0;
492  }
493  timer_e get_type() const { return timerEnumValue; }
494};
495
496// Where you need to partition a threads clock ticks into separate states
497// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
498// DOING_NOTHING would render these conditions:
499// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
500// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
501// versa
502class partitionedTimers {
503private:
504  std::vector<explicitTimer> timer_stack;
505
506public:
507  partitionedTimers();
508  void init(explicitTimer timer);
509  void exchange(explicitTimer timer);
510  void push(explicitTimer timer);
511  void pop();
512  void windup();
513};
514
515// Special wrapper around the partioned timers to aid timing code blocks
516// It avoids the need to have an explicit end, leaving the scope suffices.
517class blockPartitionedTimer {
518  partitionedTimers *part_timers;
519
520public:
521  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
522      : part_timers(pt) {
523    part_timers->push(timer);
524  }
525  ~blockPartitionedTimer() { part_timers->pop(); }
526};
527
528// Special wrapper around the thread state to aid in keeping state in code
529// blocks It avoids the need to have an explicit end, leaving the scope
530// suffices.
531class blockThreadState {
532  stats_state_e *state_pointer;
533  stats_state_e old_state;
534
535public:
536  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
537      : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
538    *state_pointer = new_state;
539  }
540  ~blockThreadState() { *state_pointer = old_state; }
541};
542
543// If all you want is a count, then you can use this...
544// The individual per-thread counts will be aggregated into a statistic at
545// program exit.
546class counter {
547  uint64_t value;
548  static const statInfo counterInfo[];
549
550public:
551  counter() : value(0) {}
552  void increment() { value++; }
553  uint64_t getValue() const { return value; }
554  void reset() { value = 0; }
555  static const char *name(counter_e e) { return counterInfo[e].name; }
556  static bool masterOnly(counter_e e) {
557    return counterInfo[e].flags & stats_flags_e::onlyInMaster;
558  }
559};
560
561/* ****************************************************************
562    Class to implement an event
563
564    There are four components to an event: start time, stop time
565    nest_level, and timer_name.
566    The start and stop time should be obvious (recorded in clock ticks).
567    The nest_level relates to the bar width in the timeline graph.
568    The timer_name is used to determine which timer event triggered this event.
569
570    the interface to this class is through four read-only operations:
571    1) getStart()     -- returns the start time as 64 bit integer
572    2) getStop()      -- returns the stop time as 64 bit integer
573    3) getNestLevel() -- returns the nest level of the event
574    4) getTimerName() -- returns the timer name that triggered event
575
576    *MORE ON NEST_LEVEL*
577    The nest level is used in the bar graph that represents the timeline.
578    Its main purpose is for showing how events are nested inside eachother.
579    For example, say events, A, B, and C are recorded.  If the timeline
580    looks like this:
581
582Begin -------------------------------------------------------------> Time
583         |    |          |        |          |              |
584         A    B          C        C          B              A
585       start start     start     end        end            end
586
587       Then A, B, C will have a nest level of 1, 2, 3 respectively.
588       These values are then used to calculate the barwidth so you can
589       see that inside A, B has occurred, and inside B, C has occurred.
590       Currently, this is shown with A's bar width being larger than B's
591       bar width, and B's bar width being larger than C's bar width.
592
593**************************************************************** */
594class kmp_stats_event {
595  uint64_t start;
596  uint64_t stop;
597  int nest_level;
598  timer_e timer_name;
599
600public:
601  kmp_stats_event()
602      : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
603  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
604      : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
605  inline uint64_t getStart() const { return start; }
606  inline uint64_t getStop() const { return stop; }
607  inline int getNestLevel() const { return nest_level; }
608  inline timer_e getTimerName() const { return timer_name; }
609};
610
611/* ****************************************************************
612    Class to implement a dynamically expandable array of events
613
614    ---------------------------------------------------------
615    | event 1 | event 2 | event 3 | event 4 | ... | event N |
616    ---------------------------------------------------------
617
618    An event is pushed onto the back of this array at every
619    explicitTimer->stop() call.  The event records the thread #,
620    start time, stop time, and nest level related to the bar width.
621
622    The event vector starts at size INIT_SIZE and grows (doubles in size)
623    if needed.  An implication of this behavior is that log(N)
624    reallocations are needed (where N is number of events).  If you want
625    to avoid reallocations, then set INIT_SIZE to a large value.
626
627    the interface to this class is through six operations:
628    1) reset() -- sets the internal_size back to 0 but does not deallocate any
629       memory
630    2) size()  -- returns the number of valid elements in the vector
631    3) push_back(start, stop, nest, timer_name) -- pushes an event onto
632       the back of the array
633    4) deallocate() -- frees all memory associated with the vector
634    5) sort() -- sorts the vector by start time
635    6) operator[index] or at(index) -- returns event reference at that index
636**************************************************************** */
637class kmp_stats_event_vector {
638  kmp_stats_event *events;
639  int internal_size;
640  int allocated_size;
641  static const int INIT_SIZE = 1024;
642
643public:
644  kmp_stats_event_vector() {
645    events =
646        (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
647    internal_size = 0;
648    allocated_size = INIT_SIZE;
649  }
650  ~kmp_stats_event_vector() {}
651  inline void reset() { internal_size = 0; }
652  inline int size() const { return internal_size; }
653  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
654                 timer_e name) {
655    int i;
656    if (internal_size == allocated_size) {
657      kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
658          sizeof(kmp_stats_event) * allocated_size * 2);
659      for (i = 0; i < internal_size; i++)
660        tmp[i] = events[i];
661      __kmp_free(events);
662      events = tmp;
663      allocated_size *= 2;
664    }
665    events[internal_size] =
666        kmp_stats_event(start_time, stop_time, nest_level, name);
667    internal_size++;
668    return;
669  }
670  void deallocate();
671  void sort();
672  const kmp_stats_event &operator[](int index) const { return events[index]; }
673  kmp_stats_event &operator[](int index) { return events[index]; }
674  const kmp_stats_event &at(int index) const { return events[index]; }
675  kmp_stats_event &at(int index) { return events[index]; }
676};
677
678/* ****************************************************************
679    Class to implement a doubly-linked, circular, statistics list
680
681    |---| ---> |---| ---> |---| ---> |---| ---> ... next
682    |   |      |   |      |   |      |   |
683    |---| <--- |---| <--- |---| <--- |---| <--- ... prev
684    Sentinel   first      second     third
685    Node       node       node       node
686
687    The Sentinel Node is the user handle on the list.
688    The first node corresponds to thread 0's statistics.
689    The second node corresponds to thread 1's statistics and so on...
690
691    Each node has a _timers, _counters, and _explicitTimers array to hold that
692    thread's statistics. The _explicitTimers point to the correct _timer and
693    update its statistics at every stop() call. The explicitTimers' pointers are
694    set up in the constructor. Each node also has an event vector to hold that
695    thread's timing events. The event vector expands as necessary and records
696    the start-stop times for each timer.
697
698    The nestLevel variable is for plotting events and is related
699    to the bar width in the timeline graph.
700
701    Every thread will have a thread local pointer to its node in
702    the list.  The sentinel node is used by the master thread to
703    store "dummy" statistics before __kmp_create_worker() is called.
704**************************************************************** */
705class kmp_stats_list {
706  int gtid;
707  timeStat _timers[TIMER_LAST + 1];
708  counter _counters[COUNTER_LAST + 1];
709  explicitTimer thread_life_timer;
710  partitionedTimers _partitionedTimers;
711  int _nestLevel; // one per thread
712  kmp_stats_event_vector _event_vector;
713  kmp_stats_list *next;
714  kmp_stats_list *prev;
715  stats_state_e state;
716  int thread_is_idle_flag;
717
718public:
719  kmp_stats_list()
720      : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
721                          TIMER_OMP_worker_thread_life),
722        _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
723        thread_is_idle_flag(0) {}
724  ~kmp_stats_list() {}
725  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
726  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
727  inline partitionedTimers *getPartitionedTimers() {
728    return &_partitionedTimers;
729  }
730  inline timeStat *getTimers() { return _timers; }
731  inline counter *getCounters() { return _counters; }
732  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
733  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
734  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
735  inline void resetEventVector() { _event_vector.reset(); }
736  inline void incrementNestValue() { _nestLevel++; }
737  inline int getNestValue() { return _nestLevel; }
738  inline void decrementNestValue() { _nestLevel--; }
739  inline int getGtid() const { return gtid; }
740  inline void setGtid(int newgtid) { gtid = newgtid; }
741  inline void setState(stats_state_e newstate) { state = newstate; }
742  inline stats_state_e getState() const { return state; }
743  inline stats_state_e *getStatePointer() { return &state; }
744  inline bool isIdle() { return thread_is_idle_flag == 1; }
745  inline void setIdleFlag() { thread_is_idle_flag = 1; }
746  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
747  kmp_stats_list *push_back(int gtid); // returns newly created list node
748  inline void push_event(uint64_t start_time, uint64_t stop_time,
749                         int nest_level, timer_e name) {
750    _event_vector.push_back(start_time, stop_time, nest_level, name);
751  }
752  void deallocate();
753  class iterator;
754  kmp_stats_list::iterator begin();
755  kmp_stats_list::iterator end();
756  int size();
757  class iterator {
758    kmp_stats_list *ptr;
759    friend kmp_stats_list::iterator kmp_stats_list::begin();
760    friend kmp_stats_list::iterator kmp_stats_list::end();
761
762  public:
763    iterator();
764    ~iterator();
765    iterator operator++();
766    iterator operator++(int dummy);
767    iterator operator--();
768    iterator operator--(int dummy);
769    bool operator!=(const iterator &rhs);
770    bool operator==(const iterator &rhs);
771    kmp_stats_list *operator*() const; // dereference operator
772  };
773};
774
775/* ****************************************************************
776   Class to encapsulate all output functions and the environment variables
777
778   This module holds filenames for various outputs (normal stats, events, plot
779   file), as well as coloring information for the plot file.
780
781   The filenames and flags variables are read from environment variables.
782   These are read once by the constructor of the global variable
783   __kmp_stats_output which calls init().
784
785   During this init() call, event flags for the timeStat::timerInfo[] global
786   array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
787
788   The only interface function that is public is outputStats(heading).  This
789   function should print out everything it needs to, either to files or stderr,
790   depending on the environment variables described below
791
792   ENVIRONMENT VARIABLES:
793   KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
794                     file, otherwise, print to stderr
795   KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
796                        either KMP_STATS_FILE or stderr
797   KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
798                          otherwise, the plot file is sent to "events.plt"
799   KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
800                       events
801   KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
802                            otherwise, output is sent to "events.dat"
803**************************************************************** */
804class kmp_stats_output_module {
805
806public:
807  struct rgb_color {
808    float r;
809    float g;
810    float b;
811  };
812
813private:
814  std::string outputFileName;
815  static const char *eventsFileName;
816  static const char *plotFileName;
817  static int printPerThreadFlag;
818  static int printPerThreadEventsFlag;
819  static const rgb_color globalColorArray[];
820  static rgb_color timerColorInfo[];
821
822  void init();
823  static void setupEventColors();
824  static void printPloticusFile();
825  static void printHeaderInfo(FILE *statsOut);
826  static void printTimerStats(FILE *statsOut, statistic const *theStats,
827                              statistic const *totalStats);
828  static void printCounterStats(FILE *statsOut, statistic const *theStats);
829  static void printCounters(FILE *statsOut, counter const *theCounters);
830  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
831                          int gtid);
832  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
833  static void windupExplicitTimers();
834  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
835
836public:
837  kmp_stats_output_module() { init(); }
838  void outputStats(const char *heading);
839};
840
841#ifdef __cplusplus
842extern "C" {
843#endif
844void __kmp_stats_init();
845void __kmp_stats_fini();
846void __kmp_reset_stats();
847void __kmp_output_stats(const char *);
848void __kmp_accumulate_stats_at_exit(void);
849// thread local pointer to stats node within list
850extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
851// head to stats list.
852extern kmp_stats_list *__kmp_stats_list;
853// lock for __kmp_stats_list
854extern kmp_tas_lock_t __kmp_stats_lock;
855// reference start time
856extern tsc_tick_count __kmp_stats_start_time;
857// interface to output
858extern kmp_stats_output_module __kmp_stats_output;
859
860#ifdef __cplusplus
861}
862#endif
863
864// Simple, standard interfaces that drop out completely if stats aren't enabled
865
866/*!
867 * \brief Adds value to specified timer (name).
868 *
869 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
870 * @param value double precision sample value to add to statistics for the timer
871 *
872 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
873 * a timer statistics.
874 *
875 * @ingroup STATS_GATHERING
876*/
877#define KMP_COUNT_VALUE(name, value)                                           \
878  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
879
880/*!
881 * \brief Increments specified counter (name).
882 *
883 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
884 *
885 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics
886 * counter for the executing thread.
887 *
888 * @ingroup STATS_GATHERING
889*/
890#define KMP_COUNT_BLOCK(name)                                                  \
891  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
892
893/*!
894 * \brief Outputs the current thread statistics and reset them.
895 *
896 * @param heading_string heading put above the final stats output
897 *
898 * \details Explicitly stops all timers and outputs all stats. Environment
899 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a
900 * filename instead of stderr. Environment variable,
901 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific
902 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be
903 * defined with any value, which will print out thread specific stats, or it can
904 * be undefined (not specified in the environment) and thread specific stats
905 * won't be printed. It should be noted that all statistics are reset when this
906 * macro is called.
907 *
908 * @ingroup STATS_GATHERING
909*/
910#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
911
912/*!
913 * \brief Initializes the paritioned timers to begin with name.
914 *
915 * @param name timer which you want this thread to begin with
916 *
917 * @ingroup STATS_GATHERING
918*/
919#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
920  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
921      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
922
923#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
924  blockPartitionedTimer __PBLOCKTIME__(                                        \
925      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
926      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
927                    TIMER_##name))
928
929#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
930  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
931      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
932
933#define KMP_POP_PARTITIONED_TIMER()                                            \
934  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
935
936#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
937  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
938      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
939
940#define KMP_SET_THREAD_STATE(state_name)                                       \
941  __kmp_stats_thread_ptr->setState(state_name)
942
943#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
944
945#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
946  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
947                                    state_name)
948
949/*!
950 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
951 *
952 * \details Reset all stats for all threads.
953 *
954 * @ingroup STATS_GATHERING
955*/
956#define KMP_RESET_STATS() __kmp_reset_stats()
957
958#if (KMP_DEVELOPER_STATS)
959#define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n)
960#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
961#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
962#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n)
963#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n)
964#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
965#else
966// Null definitions
967#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
968#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
969#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
970#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
971#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
972#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
973#endif
974
975#else // KMP_STATS_ENABLED
976
977// Null definitions
978#define KMP_TIME_BLOCK(n) ((void)0)
979#define KMP_COUNT_VALUE(n, v) ((void)0)
980#define KMP_COUNT_BLOCK(n) ((void)0)
981#define KMP_START_EXPLICIT_TIMER(n) ((void)0)
982#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0)
983
984#define KMP_OUTPUT_STATS(heading_string) ((void)0)
985#define KMP_RESET_STATS() ((void)0)
986
987#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
988#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
989#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
990#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
991#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
992#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
993#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
994#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
995#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
996#define KMP_POP_PARTITIONED_TIMER() ((void)0)
997#define KMP_SET_THREAD_STATE(state_name) ((void)0)
998#define KMP_GET_THREAD_STATE() ((void)0)
999#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1000#endif // KMP_STATS_ENABLED
1001
1002#endif // KMP_STATS_H
1003