1#ifndef KMP_STATS_H
2#define KMP_STATS_H
3
4/** @file kmp_stats.h
5 * Functions for collecting statistics.
6 */
7
8//===----------------------------------------------------------------------===//
9//
10// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11// See https://llvm.org/LICENSE.txt for license information.
12// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13//
14//===----------------------------------------------------------------------===//
15
16#include "kmp_config.h"
17#include "kmp_debug.h"
18
19#if KMP_STATS_ENABLED
20/* Statistics accumulator.
21   Accumulates number of samples and computes min, max, mean, standard deviation
22   on the fly.
23
24   Online variance calculation algorithm from
25   http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26 */
27
28#include "kmp_stats_timing.h"
29#include <limits>
30#include <math.h>
31#include <new> // placement new
32#include <stdint.h>
33#include <string>
34#include <vector>
35
36/* Enable developer statistics here if you want them. They are more detailed
37   than is useful for application characterisation and are intended for the
38   runtime library developer. */
39#define KMP_DEVELOPER_STATS 0
40
41/* Enable/Disable histogram output */
42#define KMP_STATS_HIST 0
43
44/*!
45 * @ingroup STATS_GATHERING
46 * \brief flags to describe the statistic (timer or counter)
47 *
48 */
49enum stats_flags_e {
50  noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic
51  onlyInMaster = 1 << 1, //!< statistic is valid only for master
52  noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
53  notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
54  logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
55  //! KMP_STATS_EVENTS is on (valid only for timers)
56};
57
58/*!
59 * @ingroup STATS_GATHERING
60 * \brief the states which a thread can be in
61 *
62 */
63enum stats_state_e {
64  IDLE,
65  SERIAL_REGION,
66  FORK_JOIN_BARRIER,
67  PLAIN_BARRIER,
68  TASKWAIT,
69  TASKYIELD,
70  TASKGROUP,
71  IMPLICIT_TASK,
72  EXPLICIT_TASK,
73  TEAMS_REGION
74};
75
76/*!
77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
78 *
79 * @param macro a user defined macro that takes three arguments -
80 * macro(COUNTER_NAME, flags, arg)
81 * @param arg a user defined argument to send to the user defined macro
82 *
83 * \details A counter counts the occurrence of some event. Each thread
84 * accumulates its own count, at the end of execution the counts are aggregated
85 * treating each thread as a separate measurement. (Unless onlyInMaster is set,
86 * in which case there's only a single measurement). The min,mean,max are
87 * therefore the values for the threads. Adding the counter here and then
88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
89 * need to do. All of the tables and printing is generated from this macro.
90 * Format is "macro(name, flags, arg)"
91 *
92 * @ingroup STATS_GATHERING
93 */
94// clang-format off
95#define KMP_FOREACH_COUNTER(macro, arg)                                        \
96  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
97  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
98  macro(OMP_LOOP_STATIC, 0, arg)                                               \
99  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
100  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
101  macro(OMP_DISTRIBUTE, 0, arg)                                                \
102  macro(OMP_BARRIER, 0, arg)                                                   \
103  macro(OMP_CRITICAL, 0, arg)                                                  \
104  macro(OMP_SINGLE, 0, arg)                                                    \
105  macro(OMP_MASTER, 0, arg)                                                    \
106  macro(OMP_TEAMS, 0, arg)                                                     \
107  macro(OMP_set_lock, 0, arg)                                                  \
108  macro(OMP_test_lock, 0, arg)                                                 \
109  macro(REDUCE_wait, 0, arg)                                                   \
110  macro(REDUCE_nowait, 0, arg)                                                 \
111  macro(OMP_TASKYIELD, 0, arg)                                                 \
112  macro(OMP_TASKLOOP, 0, arg)                                                  \
113  macro(TASK_executed, 0, arg)                                                 \
114  macro(TASK_cancelled, 0, arg)                                                \
115  macro(TASK_stolen, 0, arg)
116// clang-format on
117
118/*!
119 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
120 *
121 * @param macro a user defined macro that takes three arguments -
122 * macro(TIMER_NAME, flags, arg)
123 * @param arg a user defined argument to send to the user defined macro
124 *
125 * \details A timer collects multiple samples of some count in each thread and
126 * then finally aggregates all of the samples from all of the threads. For most
127 * timers the printing code also provides an aggregation over the thread totals.
128 * These are printed as TOTAL_foo. The count is normally a time (in ticks),
129 * hence the name "timer". (But can be any value, so we use this for "number of
130 * arguments passed to fork" as well). For timers the threads are not
131 * significant, it's the individual observations that count, so the statistics
132 * are at that level. Format is "macro(name, flags, arg)"
133 *
134 * @ingroup STATS_GATHERING2
135 */
136// clang-format off
137#define KMP_FOREACH_TIMER(macro, arg)                                          \
138  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
139  macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
140  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
141  macro (OMP_teams, stats_flags_e::logEvent, arg)                              \
142  macro (OMP_teams_overhead, stats_flags_e::logEvent, arg)                     \
143  macro (OMP_loop_static, 0, arg)                                              \
144  macro (OMP_loop_static_scheduling, 0, arg)                                   \
145  macro (OMP_loop_dynamic, 0, arg)                                             \
146  macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
147  macro (OMP_distribute, 0, arg)                                               \
148  macro (OMP_distribute_scheduling, 0, arg)                                    \
149  macro (OMP_critical, 0, arg)                                                 \
150  macro (OMP_critical_wait, 0, arg)                                            \
151  macro (OMP_single, 0, arg)                                                   \
152  macro (OMP_master, 0, arg)                                                   \
153  macro (OMP_task_immediate, 0, arg)                                           \
154  macro (OMP_task_taskwait, 0, arg)                                            \
155  macro (OMP_task_taskyield, 0, arg)                                           \
156  macro (OMP_task_taskgroup, 0, arg)                                           \
157  macro (OMP_task_join_bar, 0, arg)                                            \
158  macro (OMP_task_plain_bar, 0, arg)                                           \
159  macro (OMP_taskloop_scheduling, 0, arg)                                      \
160  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
161  macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
162  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
163  macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
164  macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
165  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
166         arg)                                                                  \
167  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
168         arg)                                                                  \
169  macro (OMP_loop_static_iterations,                                           \
170         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
171  macro (OMP_loop_static_total_iterations,                                     \
172         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
173  macro (OMP_loop_dynamic_iterations,                                          \
174         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
175  macro (OMP_loop_dynamic_total_iterations,                                    \
176         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
177  macro (OMP_distribute_iterations,                                            \
178         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
179  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
180// clang-format on
181
182// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
183//                           initializing OpenMP or being created by a master)
184//                           until the thread is destroyed
185// OMP_parallel           -- Time thread spends executing work directly
186//                           within a #pragma omp parallel
187// OMP_parallel_overhead  -- Time thread spends setting up a parallel region
188// OMP_loop_static        -- Time thread spends executing loop iterations from
189//                           a statically scheduled loop
190// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
191//                               from a statically scheduled loop
192// OMP_loop_dynamic       -- Time thread spends executing loop iterations from
193//                           a dynamically scheduled loop
194// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
195//                                from a dynamically scheduled loop
196// OMP_critical           -- Time thread spends executing critical section
197// OMP_critical_wait      -- Time thread spends waiting to enter
198//                           a critical section
199// OMP_single             -- Time spent executing a "single" region
200// OMP_master             -- Time spent executing a "master" region
201// OMP_task_immediate     -- Time spent executing non-deferred tasks
202// OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
203//                           construct
204// OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
205//                           construct
206// OMP_task_taskgroup     -- Time spent executing tasks inside a taskygroup
207//                           construct
208// OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
209// OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
210//                           construct
211// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
212//                            construct
213// OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
214//                           inside implicit barrier at end of worksharing
215//                           construct
216// OMP_idle               -- Time worker threads spend waiting for next
217//                           parallel region
218// OMP_fork_barrier       -- Time spent in a the fork barrier surrounding a
219//                           parallel region
220// OMP_join_barrier       -- Time spent in a the join barrier surrounding a
221//                           parallel region
222// OMP_serial             -- Time thread zero spends executing serial code
223// OMP_set_numthreads     -- Values passed to omp_set_num_threads
224// OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
225// OMP_loop_static_iterations -- Number of iterations thread is assigned for
226//                               statically scheduled loops
227// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
228//                                dynamically scheduled loops
229
230#if (KMP_DEVELOPER_STATS)
231// Timers which are of interest to runtime library developers, not end users.
232// These have to be explicitly enabled in addition to the other stats.
233
234// KMP_fork_barrier       -- time in __kmp_fork_barrier
235// KMP_join_barrier       -- time in __kmp_join_barrier
236// KMP_barrier            -- time in __kmp_barrier
237// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
238// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
239// KMP_icv_copy           -- start/stop timer for any ICV copying
240// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
241// KMP_linear_release     -- time in __kmp_linear_barrier_release
242// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
243// KMP_tree_release       -- time in __kmp_tree_barrier_release
244// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
245// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
246// clang-format off
247#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
248  macro(KMP_fork_call, 0, arg)                                                 \
249  macro(KMP_join_call, 0, arg)                                                 \
250  macro(KMP_end_split_barrier, 0, arg)                                         \
251  macro(KMP_hier_gather, 0, arg)                                               \
252  macro(KMP_hier_release, 0, arg)                                              \
253  macro(KMP_hyper_gather, 0, arg)                                              \
254  macro(KMP_hyper_release, 0, arg)                                             \
255  macro(KMP_linear_gather, 0, arg)                                             \
256  macro(KMP_linear_release, 0, arg)                                            \
257  macro(KMP_tree_gather, 0, arg)                                               \
258  macro(KMP_tree_release, 0, arg)                                              \
259  macro(USER_resume, 0, arg)                                                   \
260  macro(USER_suspend, 0, arg)                                                  \
261  macro(KMP_allocate_team, 0, arg)                                             \
262  macro(KMP_setup_icv_copy, 0, arg)                                            \
263  macro(USER_icv_copy, 0, arg)                                                 \
264  macro (FOR_static_steal_stolen,                                              \
265         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
266  macro (FOR_static_steal_chunks,                                              \
267         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
268#else
269#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
270#endif
271// clang-format on
272
273/*!
274 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
275 *
276 * @param macro a user defined macro that takes three arguments -
277 * macro(TIMER_NAME, flags, arg)
278 * @param arg a user defined argument to send to the user defined macro
279 *
280 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
281 * BAD THINGS WILL HAPPEN!
282 *
283 * \details Explicit timers are ones where we need to allocate a timer itself
284 * (as well as the accumulated timing statistics). We allocate these on a
285 * per-thread basis, and explicitly start and stop them. Block timers just
286 * allocate the timer itself on the stack, and use the destructor to notice
287 * block exit; they don't need to be defined here. The name here should be the
288 * same as that of a timer above.
289 *
290 * @ingroup STATS_GATHERING
291*/
292#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
293
294#define ENUMERATE(name, ignore, prefix) prefix##name,
295enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
296
297enum explicit_timer_e {
298  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
299};
300
301enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
302#undef ENUMERATE
303
304/*
305 * A logarithmic histogram. It accumulates the number of values in each power of
306 * ten bin.  So 1<=x<10, 10<=x<100, ...
307 * Mostly useful where we have some big outliers and want to see information
308 * about them.
309 */
310class logHistogram {
311  enum {
312    numBins = 31, /* Number of powers of 10. If this changes you need to change
313                   * the initializer for binMax */
314
315    /*
316     * If you want to use this to analyse values that may be less than 1, (for
317     * instance times in s), then the logOffset gives you negative powers.
318     * In our case here, we're just looking at times in ticks, or counts, so we
319     * can never see values with magnitude < 1 (other than zero), so we can set
320     * it to 0.  As above change the initializer if you change this.
321     */
322    logOffset = 0
323  };
324  uint32_t KMP_ALIGN_CACHE zeroCount;
325  struct {
326    uint32_t count;
327    double total;
328  } bins[numBins];
329
330  static double binMax[numBins];
331
332#ifdef KMP_DEBUG
333  uint64_t _total;
334
335  void check() const {
336    uint64_t t = zeroCount;
337    for (int i = 0; i < numBins; i++)
338      t += bins[i].count;
339    KMP_DEBUG_ASSERT(t == _total);
340  }
341#else
342  void check() const {}
343#endif
344
345public:
346  logHistogram() { reset(); }
347
348  logHistogram(logHistogram const &o) {
349    for (int i = 0; i < numBins; i++)
350      bins[i] = o.bins[i];
351#ifdef KMP_DEBUG
352    _total = o._total;
353#endif
354  }
355
356  void reset() {
357    zeroCount = 0;
358    for (int i = 0; i < numBins; i++) {
359      bins[i].count = 0;
360      bins[i].total = 0;
361    }
362
363#ifdef KMP_DEBUG
364    _total = 0;
365#endif
366  }
367  uint32_t count(int b) const { return bins[b + logOffset].count; }
368  double total(int b) const { return bins[b + logOffset].total; }
369  static uint32_t findBin(double sample);
370
371  logHistogram &operator+=(logHistogram const &o) {
372    zeroCount += o.zeroCount;
373    for (int i = 0; i < numBins; i++) {
374      bins[i].count += o.bins[i].count;
375      bins[i].total += o.bins[i].total;
376    }
377#ifdef KMP_DEBUG
378    _total += o._total;
379    check();
380#endif
381
382    return *this;
383  }
384
385  void addSample(double sample);
386  int minBin() const;
387  int maxBin() const;
388
389  std::string format(char) const;
390};
391
392class statistic {
393  double KMP_ALIGN_CACHE minVal;
394  double maxVal;
395  double meanVal;
396  double m2;
397  uint64_t sampleCount;
398  double offset;
399  bool collectingHist;
400  logHistogram hist;
401
402public:
403  statistic(bool doHist = bool(KMP_STATS_HIST)) {
404    reset();
405    collectingHist = doHist;
406  }
407  statistic(statistic const &o)
408      : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
409        sampleCount(o.sampleCount), offset(o.offset),
410        collectingHist(o.collectingHist), hist(o.hist) {}
411  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
412      : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
413        sampleCount(sc), offset(0.0), collectingHist(false) {}
414  bool haveHist() const { return collectingHist; }
415  double getMin() const { return minVal; }
416  double getMean() const { return meanVal; }
417  double getMax() const { return maxVal; }
418  uint64_t getCount() const { return sampleCount; }
419  double getSD() const { return sqrt(m2 / sampleCount); }
420  double getTotal() const { return sampleCount * meanVal; }
421  logHistogram const *getHist() const { return &hist; }
422  void setOffset(double d) { offset = d; }
423
424  void reset() {
425    minVal = std::numeric_limits<double>::max();
426    maxVal = -minVal;
427    meanVal = 0.0;
428    m2 = 0.0;
429    sampleCount = 0;
430    offset = 0.0;
431    hist.reset();
432  }
433  void addSample(double sample);
434  void scale(double factor);
435  void scaleDown(double f) { scale(1. / f); }
436  void forceCount(uint64_t count) { sampleCount = count; }
437  statistic &operator+=(statistic const &other);
438
439  std::string format(char unit, bool total = false) const;
440  std::string formatHist(char unit) const { return hist.format(unit); }
441};
442
443struct statInfo {
444  const char *name;
445  uint32_t flags;
446};
447
448class timeStat : public statistic {
449  static statInfo timerInfo[];
450
451public:
452  timeStat() : statistic() {}
453  static const char *name(timer_e e) { return timerInfo[e].name; }
454  static bool noTotal(timer_e e) {
455    return timerInfo[e].flags & stats_flags_e::noTotal;
456  }
457  static bool masterOnly(timer_e e) {
458    return timerInfo[e].flags & stats_flags_e::onlyInMaster;
459  }
460  static bool workerOnly(timer_e e) {
461    return timerInfo[e].flags & stats_flags_e::notInMaster;
462  }
463  static bool noUnits(timer_e e) {
464    return timerInfo[e].flags & stats_flags_e::noUnits;
465  }
466  static bool logEvent(timer_e e) {
467    return timerInfo[e].flags & stats_flags_e::logEvent;
468  }
469  static void clearEventFlags() {
470    for (int i = 0; i < TIMER_LAST; i++) {
471      timerInfo[i].flags &= (~(stats_flags_e::logEvent));
472    }
473  }
474};
475
476// Where we need explicitly to start and end the timer, this version can be used
477// Since these timers normally aren't nicely scoped, so don't have a good place
478// to live on the stack of the thread, they're more work to use.
479class explicitTimer {
480  timeStat *stat;
481  timer_e timerEnumValue;
482  tsc_tick_count startTime;
483  tsc_tick_count pauseStartTime;
484  tsc_tick_count::tsc_interval_t totalPauseTime;
485
486public:
487  explicitTimer(timeStat *s, timer_e te)
488      : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
489        totalPauseTime() {}
490
491  // void setStat(timeStat *s) { stat = s; }
492  void start(tsc_tick_count tick);
493  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
494  void resume(tsc_tick_count tick) {
495    totalPauseTime += (tick - pauseStartTime);
496  }
497  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
498  void reset() {
499    startTime = 0;
500    pauseStartTime = 0;
501    totalPauseTime = 0;
502  }
503  timer_e get_type() const { return timerEnumValue; }
504};
505
506// Where you need to partition a threads clock ticks into separate states
507// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
508// DOING_NOTHING would render these conditions:
509// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
510// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
511// versa
512class partitionedTimers {
513private:
514  std::vector<explicitTimer> timer_stack;
515
516public:
517  partitionedTimers();
518  void init(explicitTimer timer);
519  void exchange(explicitTimer timer);
520  void push(explicitTimer timer);
521  void pop();
522  void windup();
523};
524
525// Special wrapper around the partitioned timers to aid timing code blocks
526// It avoids the need to have an explicit end, leaving the scope suffices.
527class blockPartitionedTimer {
528  partitionedTimers *part_timers;
529
530public:
531  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
532      : part_timers(pt) {
533    part_timers->push(timer);
534  }
535  ~blockPartitionedTimer() { part_timers->pop(); }
536};
537
538// Special wrapper around the thread state to aid in keeping state in code
539// blocks It avoids the need to have an explicit end, leaving the scope
540// suffices.
541class blockThreadState {
542  stats_state_e *state_pointer;
543  stats_state_e old_state;
544
545public:
546  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
547      : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
548    *state_pointer = new_state;
549  }
550  ~blockThreadState() { *state_pointer = old_state; }
551};
552
553// If all you want is a count, then you can use this...
554// The individual per-thread counts will be aggregated into a statistic at
555// program exit.
556class counter {
557  uint64_t value;
558  static const statInfo counterInfo[];
559
560public:
561  counter() : value(0) {}
562  void increment() { value++; }
563  uint64_t getValue() const { return value; }
564  void reset() { value = 0; }
565  static const char *name(counter_e e) { return counterInfo[e].name; }
566  static bool masterOnly(counter_e e) {
567    return counterInfo[e].flags & stats_flags_e::onlyInMaster;
568  }
569};
570
571/* ****************************************************************
572    Class to implement an event
573
574    There are four components to an event: start time, stop time
575    nest_level, and timer_name.
576    The start and stop time should be obvious (recorded in clock ticks).
577    The nest_level relates to the bar width in the timeline graph.
578    The timer_name is used to determine which timer event triggered this event.
579
580    the interface to this class is through four read-only operations:
581    1) getStart()     -- returns the start time as 64 bit integer
582    2) getStop()      -- returns the stop time as 64 bit integer
583    3) getNestLevel() -- returns the nest level of the event
584    4) getTimerName() -- returns the timer name that triggered event
585
586    *MORE ON NEST_LEVEL*
587    The nest level is used in the bar graph that represents the timeline.
588    Its main purpose is for showing how events are nested inside eachother.
589    For example, say events, A, B, and C are recorded.  If the timeline
590    looks like this:
591
592Begin -------------------------------------------------------------> Time
593         |    |          |        |          |              |
594         A    B          C        C          B              A
595       start start     start     end        end            end
596
597       Then A, B, C will have a nest level of 1, 2, 3 respectively.
598       These values are then used to calculate the barwidth so you can
599       see that inside A, B has occurred, and inside B, C has occurred.
600       Currently, this is shown with A's bar width being larger than B's
601       bar width, and B's bar width being larger than C's bar width.
602
603**************************************************************** */
604class kmp_stats_event {
605  uint64_t start;
606  uint64_t stop;
607  int nest_level;
608  timer_e timer_name;
609
610public:
611  kmp_stats_event()
612      : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
613  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
614      : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
615  inline uint64_t getStart() const { return start; }
616  inline uint64_t getStop() const { return stop; }
617  inline int getNestLevel() const { return nest_level; }
618  inline timer_e getTimerName() const { return timer_name; }
619};
620
621/* ****************************************************************
622    Class to implement a dynamically expandable array of events
623
624    ---------------------------------------------------------
625    | event 1 | event 2 | event 3 | event 4 | ... | event N |
626    ---------------------------------------------------------
627
628    An event is pushed onto the back of this array at every
629    explicitTimer->stop() call.  The event records the thread #,
630    start time, stop time, and nest level related to the bar width.
631
632    The event vector starts at size INIT_SIZE and grows (doubles in size)
633    if needed.  An implication of this behavior is that log(N)
634    reallocations are needed (where N is number of events).  If you want
635    to avoid reallocations, then set INIT_SIZE to a large value.
636
637    the interface to this class is through six operations:
638    1) reset() -- sets the internal_size back to 0 but does not deallocate any
639       memory
640    2) size()  -- returns the number of valid elements in the vector
641    3) push_back(start, stop, nest, timer_name) -- pushes an event onto
642       the back of the array
643    4) deallocate() -- frees all memory associated with the vector
644    5) sort() -- sorts the vector by start time
645    6) operator[index] or at(index) -- returns event reference at that index
646**************************************************************** */
647class kmp_stats_event_vector {
648  kmp_stats_event *events;
649  int internal_size;
650  int allocated_size;
651  static const int INIT_SIZE = 1024;
652
653public:
654  kmp_stats_event_vector() {
655    events =
656        (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
657    internal_size = 0;
658    allocated_size = INIT_SIZE;
659  }
660  ~kmp_stats_event_vector() {}
661  inline void reset() { internal_size = 0; }
662  inline int size() const { return internal_size; }
663  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
664                 timer_e name) {
665    int i;
666    if (internal_size == allocated_size) {
667      kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
668          sizeof(kmp_stats_event) * allocated_size * 2);
669      for (i = 0; i < internal_size; i++)
670        tmp[i] = events[i];
671      __kmp_free(events);
672      events = tmp;
673      allocated_size *= 2;
674    }
675    events[internal_size] =
676        kmp_stats_event(start_time, stop_time, nest_level, name);
677    internal_size++;
678    return;
679  }
680  void deallocate();
681  void sort();
682  const kmp_stats_event &operator[](int index) const { return events[index]; }
683  kmp_stats_event &operator[](int index) { return events[index]; }
684  const kmp_stats_event &at(int index) const { return events[index]; }
685  kmp_stats_event &at(int index) { return events[index]; }
686};
687
688/* ****************************************************************
689    Class to implement a doubly-linked, circular, statistics list
690
691    |---| ---> |---| ---> |---| ---> |---| ---> ... next
692    |   |      |   |      |   |      |   |
693    |---| <--- |---| <--- |---| <--- |---| <--- ... prev
694    Sentinel   first      second     third
695    Node       node       node       node
696
697    The Sentinel Node is the user handle on the list.
698    The first node corresponds to thread 0's statistics.
699    The second node corresponds to thread 1's statistics and so on...
700
701    Each node has a _timers, _counters, and _explicitTimers array to hold that
702    thread's statistics. The _explicitTimers point to the correct _timer and
703    update its statistics at every stop() call. The explicitTimers' pointers are
704    set up in the constructor. Each node also has an event vector to hold that
705    thread's timing events. The event vector expands as necessary and records
706    the start-stop times for each timer.
707
708    The nestLevel variable is for plotting events and is related
709    to the bar width in the timeline graph.
710
711    Every thread will have a thread local pointer to its node in
712    the list.  The sentinel node is used by the master thread to
713    store "dummy" statistics before __kmp_create_worker() is called.
714**************************************************************** */
715class kmp_stats_list {
716  int gtid;
717  timeStat _timers[TIMER_LAST + 1];
718  counter _counters[COUNTER_LAST + 1];
719  explicitTimer thread_life_timer;
720  partitionedTimers _partitionedTimers;
721  int _nestLevel; // one per thread
722  kmp_stats_event_vector _event_vector;
723  kmp_stats_list *next;
724  kmp_stats_list *prev;
725  stats_state_e state;
726  int thread_is_idle_flag;
727
728public:
729  kmp_stats_list()
730      : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
731                          TIMER_OMP_worker_thread_life),
732        _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
733        thread_is_idle_flag(0) {}
734  ~kmp_stats_list() {}
735  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
736  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
737  inline partitionedTimers *getPartitionedTimers() {
738    return &_partitionedTimers;
739  }
740  inline timeStat *getTimers() { return _timers; }
741  inline counter *getCounters() { return _counters; }
742  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
743  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
744  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
745  inline void resetEventVector() { _event_vector.reset(); }
746  inline void incrementNestValue() { _nestLevel++; }
747  inline int getNestValue() { return _nestLevel; }
748  inline void decrementNestValue() { _nestLevel--; }
749  inline int getGtid() const { return gtid; }
750  inline void setGtid(int newgtid) { gtid = newgtid; }
751  inline void setState(stats_state_e newstate) { state = newstate; }
752  inline stats_state_e getState() const { return state; }
753  inline stats_state_e *getStatePointer() { return &state; }
754  inline bool isIdle() { return thread_is_idle_flag == 1; }
755  inline void setIdleFlag() { thread_is_idle_flag = 1; }
756  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
757  kmp_stats_list *push_back(int gtid); // returns newly created list node
758  inline void push_event(uint64_t start_time, uint64_t stop_time,
759                         int nest_level, timer_e name) {
760    _event_vector.push_back(start_time, stop_time, nest_level, name);
761  }
762  void deallocate();
763  class iterator;
764  kmp_stats_list::iterator begin();
765  kmp_stats_list::iterator end();
766  int size();
767  class iterator {
768    kmp_stats_list *ptr;
769    friend kmp_stats_list::iterator kmp_stats_list::begin();
770    friend kmp_stats_list::iterator kmp_stats_list::end();
771
772  public:
773    iterator();
774    ~iterator();
775    iterator operator++();
776    iterator operator++(int dummy);
777    iterator operator--();
778    iterator operator--(int dummy);
779    bool operator!=(const iterator &rhs);
780    bool operator==(const iterator &rhs);
781    kmp_stats_list *operator*() const; // dereference operator
782  };
783};
784
785/* ****************************************************************
786   Class to encapsulate all output functions and the environment variables
787
788   This module holds filenames for various outputs (normal stats, events, plot
789   file), as well as coloring information for the plot file.
790
791   The filenames and flags variables are read from environment variables.
792   These are read once by the constructor of the global variable
793   __kmp_stats_output which calls init().
794
795   During this init() call, event flags for the timeStat::timerInfo[] global
796   array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
797
798   The only interface function that is public is outputStats(heading).  This
799   function should print out everything it needs to, either to files or stderr,
800   depending on the environment variables described below
801
802   ENVIRONMENT VARIABLES:
803   KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
804                     file, otherwise, print to stderr
805   KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
806                        either KMP_STATS_FILE or stderr
807   KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
808                          otherwise, the plot file is sent to "events.plt"
809   KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
810                       events
811   KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
812                            otherwise, output is sent to "events.dat"
813**************************************************************** */
814class kmp_stats_output_module {
815
816public:
817  struct rgb_color {
818    float r;
819    float g;
820    float b;
821  };
822
823private:
824  std::string outputFileName;
825  static const char *eventsFileName;
826  static const char *plotFileName;
827  static int printPerThreadFlag;
828  static int printPerThreadEventsFlag;
829  static const rgb_color globalColorArray[];
830  static rgb_color timerColorInfo[];
831
832  void init();
833  static void setupEventColors();
834  static void printPloticusFile();
835  static void printHeaderInfo(FILE *statsOut);
836  static void printTimerStats(FILE *statsOut, statistic const *theStats,
837                              statistic const *totalStats);
838  static void printCounterStats(FILE *statsOut, statistic const *theStats);
839  static void printCounters(FILE *statsOut, counter const *theCounters);
840  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
841                          int gtid);
842  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
843  static void windupExplicitTimers();
844  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
845
846public:
847  kmp_stats_output_module() { init(); }
848  void outputStats(const char *heading);
849};
850
851#ifdef __cplusplus
852extern "C" {
853#endif
854void __kmp_stats_init();
855void __kmp_stats_fini();
856void __kmp_reset_stats();
857void __kmp_output_stats(const char *);
858void __kmp_accumulate_stats_at_exit(void);
859// thread local pointer to stats node within list
860extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
861// head to stats list.
862extern kmp_stats_list *__kmp_stats_list;
863// lock for __kmp_stats_list
864extern kmp_tas_lock_t __kmp_stats_lock;
865// reference start time
866extern tsc_tick_count __kmp_stats_start_time;
867// interface to output
868extern kmp_stats_output_module __kmp_stats_output;
869
870#ifdef __cplusplus
871}
872#endif
873
874// Simple, standard interfaces that drop out completely if stats aren't enabled
875
876/*!
877 * \brief Adds value to specified timer (name).
878 *
879 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
880 * @param value double precision sample value to add to statistics for the timer
881 *
882 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
883 * a timer statistics.
884 *
885 * @ingroup STATS_GATHERING
886*/
887#define KMP_COUNT_VALUE(name, value)                                           \
888  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
889
890/*!
891 * \brief Increments specified counter (name).
892 *
893 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
894 *
895 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics
896 * counter for the executing thread.
897 *
898 * @ingroup STATS_GATHERING
899*/
900#define KMP_COUNT_BLOCK(name)                                                  \
901  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
902
903/*!
904 * \brief Outputs the current thread statistics and reset them.
905 *
906 * @param heading_string heading put above the final stats output
907 *
908 * \details Explicitly stops all timers and outputs all stats. Environment
909 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a
910 * filename instead of stderr. Environment variable,
911 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific
912 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be
913 * defined with any value, which will print out thread specific stats, or it can
914 * be undefined (not specified in the environment) and thread specific stats
915 * won't be printed. It should be noted that all statistics are reset when this
916 * macro is called.
917 *
918 * @ingroup STATS_GATHERING
919*/
920#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
921
922/*!
923 * \brief Initializes the partitioned timers to begin with name.
924 *
925 * @param name timer which you want this thread to begin with
926 *
927 * @ingroup STATS_GATHERING
928*/
929#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
930  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
931      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
932
933#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
934  blockPartitionedTimer __PBLOCKTIME__(                                        \
935      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
936      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
937                    TIMER_##name))
938
939#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
940  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
941      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
942
943#define KMP_POP_PARTITIONED_TIMER()                                            \
944  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
945
946#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
947  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
948      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
949
950#define KMP_SET_THREAD_STATE(state_name)                                       \
951  __kmp_stats_thread_ptr->setState(state_name)
952
953#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
954
955#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
956  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
957                                    state_name)
958
959/*!
960 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
961 *
962 * \details Reset all stats for all threads.
963 *
964 * @ingroup STATS_GATHERING
965*/
966#define KMP_RESET_STATS() __kmp_reset_stats()
967
968#if (KMP_DEVELOPER_STATS)
969#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
970#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
971#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
972#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
973#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
974#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
975  KMP_EXCHANGE_PARTITIONED_TIMER(n)
976#else
977// Null definitions
978#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
979#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
980#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
981#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
982#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
983#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
984#endif
985
986#else // KMP_STATS_ENABLED
987
988// Null definitions
989#define KMP_COUNT_VALUE(n, v) ((void)0)
990#define KMP_COUNT_BLOCK(n) ((void)0)
991
992#define KMP_OUTPUT_STATS(heading_string) ((void)0)
993#define KMP_RESET_STATS() ((void)0)
994
995#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
996#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
997#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
998#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
999#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1000#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1001#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1002#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1003#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1004#define KMP_POP_PARTITIONED_TIMER() ((void)0)
1005#define KMP_SET_THREAD_STATE(state_name) ((void)0)
1006#define KMP_GET_THREAD_STATE() ((void)0)
1007#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1008#endif // KMP_STATS_ENABLED
1009
1010#endif // KMP_STATS_H
1011