1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take; 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44
45  KMP_DEBUG_ASSERT(gtid_ref);
46
47  if (__kmp_env_consistency_check) {
48    th = __kmp_threads[*gtid_ref];
49    if (th->th.th_root->r.r_active &&
50        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56    }
57  }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62
63  if (__kmp_env_consistency_check) {
64    th = __kmp_threads[*gtid_ref];
65    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67    }
68  }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                         bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  int monotonicity;
76  // default to monotonic
77  monotonicity = SCHEDULE_MONOTONIC;
78  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79    monotonicity = SCHEDULE_NONMONOTONIC;
80  else if (SCHEDULE_HAS_MONOTONIC(schedule))
81    monotonicity = SCHEDULE_MONOTONIC;
82  return monotonicity;
83}
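// A minimal mapping sketch (using the schedule modifier bits from kmp.h):
//   kmp_sch_dynamic_chunked | kmp_sch_modifier_nonmonotonic
//       -> SCHEDULE_NONMONOTONIC
//   kmp_sch_dynamic_chunked | kmp_sch_modifier_monotonic -> SCHEDULE_MONOTONIC
//   kmp_sch_dynamic_chunked (no modifier) -> SCHEDULE_MONOTONIC (the default)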
84
85// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop is described by lb (lower bound),
87// ub (upper bound), and st (stride).  nproc is the number of threads relevant
88// to the scheduling (often the number of threads in a team, but not always if
89// hierarchical scheduling is used).  tid is the id of the thread calling
90// the function within the group of nproc threads.  It will have a value
91// between 0 and nproc - 1.  This is often just the thread id within a team, but
92// is not necessarily the case when using hierarchical scheduling.
93// loc is the source file location of the corresponding loop
94// gtid is the global thread id
95template <typename T>
96void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                   dispatch_private_info_template<T> *pr,
98                                   enum sched_type schedule, T lb, T ub,
99                                   typename traits_t<T>::signed_t st,
100#if USE_ITT_BUILD
101                                   kmp_uint64 *cur_chunk,
102#endif
103                                   typename traits_t<T>::signed_t chunk,
104                                   T nproc, T tid) {
105  typedef typename traits_t<T>::unsigned_t UT;
106  typedef typename traits_t<T>::floating_t DBL;
107
108  int active;
109  T tc;
110  kmp_info_t *th;
111  kmp_team_t *team;
112  int monotonicity;
113  bool use_hier;
114
115#ifdef KMP_DEBUG
116  typedef typename traits_t<T>::signed_t ST;
117  {
118    char *buff;
119    // create format specifiers before the debug output
120    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                            traits_t<T>::spec, traits_t<T>::spec,
124                            traits_t<ST>::spec, traits_t<ST>::spec,
125                            traits_t<T>::spec, traits_t<T>::spec);
126    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127    __kmp_str_free(&buff);
128  }
129#endif
130  /* setup data */
131  th = __kmp_threads[gtid];
132  team = th->th.th_team;
133  active = !team->t.t_serialized;
134
135#if USE_ITT_BUILD
136  int itt_need_metadata_reporting =
137      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139      team->t.t_active_level == 1;
140#endif
141
142#if KMP_USE_HIER_SCHED
143  use_hier = pr->flags.use_hier;
144#else
145  use_hier = false;
146#endif
147
148  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151
152  /* Pick up the nomerge/ordered bits from the scheduling type */
153  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154    pr->flags.nomerge = TRUE;
155    schedule =
156        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157  } else {
158    pr->flags.nomerge = FALSE;
159  }
160  pr->type_size = traits_t<T>::type_size; // remember the size of variables
161  if (kmp_ord_lower & schedule) {
162    pr->flags.ordered = TRUE;
163    schedule =
164        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165  } else {
166    pr->flags.ordered = FALSE;
167  }
168  // Ordered overrides nonmonotonic
169  if (pr->flags.ordered) {
170    monotonicity = SCHEDULE_MONOTONIC;
171  }
172
173  if (schedule == kmp_sch_static) {
174    schedule = __kmp_static;
175  } else {
176    if (schedule == kmp_sch_runtime) {
177      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178      // not specified)
179      schedule = team->t.t_sched.r_sched_type;
180      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182      // Detail the schedule if needed (global controls are differentiated
183      // appropriately)
184      if (schedule == kmp_sch_guided_chunked) {
185        schedule = __kmp_guided;
186      } else if (schedule == kmp_sch_static) {
187        schedule = __kmp_static;
188      }
189      // Use the chunk size specified by OMP_SCHEDULE (or default if not
190      // specified)
191      chunk = team->t.t_sched.chunk;
192#if USE_ITT_BUILD
193      if (cur_chunk)
194        *cur_chunk = chunk;
195#endif
196#ifdef KMP_DEBUG
197      {
198        char *buff;
199        // create format specifiers before the debug output
200        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201                                "schedule:%%d chunk:%%%s\n",
202                                traits_t<ST>::spec);
203        KD_TRACE(10, (buff, gtid, schedule, chunk));
204        __kmp_str_free(&buff);
205      }
206#endif
207    } else {
208      if (schedule == kmp_sch_guided_chunked) {
209        schedule = __kmp_guided;
210      }
211      if (chunk <= 0) {
212        chunk = KMP_DEFAULT_CHUNK;
213      }
214    }
215
216    if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
218      schedule = __kmp_auto;
219#ifdef KMP_DEBUG
220      {
221        char *buff;
222        // create format specifiers before the debug output
223        buff = __kmp_str_format(
224            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225            "schedule:%%d chunk:%%%s\n",
226            traits_t<ST>::spec);
227        KD_TRACE(10, (buff, gtid, schedule, chunk));
228        __kmp_str_free(&buff);
229      }
230#endif
231    }
232#if KMP_STATIC_STEAL_ENABLED
233    // map nonmonotonic:dynamic to static steal
234    if (schedule == kmp_sch_dynamic_chunked) {
235      if (monotonicity == SCHEDULE_NONMONOTONIC)
236        schedule = kmp_sch_static_steal;
237    }
238#endif
239    /* guided analytical not safe for too many threads */
240    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241      schedule = kmp_sch_guided_iterative_chunked;
242      KMP_WARNING(DispatchManyThreads);
243    }
244    if (schedule == kmp_sch_runtime_simd) {
245      // compiler provides simd_width in the chunk parameter
246      schedule = team->t.t_sched.r_sched_type;
247      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249      // Detail the schedule if needed (global controls are differentiated
250      // appropriately)
251      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252          schedule == __kmp_static) {
253        schedule = kmp_sch_static_balanced_chunked;
254      } else {
255        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256          schedule = kmp_sch_guided_simd;
257        }
258        chunk = team->t.t_sched.chunk * chunk;
259      }
260#if USE_ITT_BUILD
261      if (cur_chunk)
262        *cur_chunk = chunk;
263#endif
264#ifdef KMP_DEBUG
265      {
266        char *buff;
267        // create format specifiers before the debug output
268        buff = __kmp_str_format(
269            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270            " chunk:%%%s\n",
271            traits_t<ST>::spec);
272        KD_TRACE(10, (buff, gtid, schedule, chunk));
273        __kmp_str_free(&buff);
274      }
275#endif
276    }
277    pr->u.p.parm1 = chunk;
278  }
279  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280              "unknown scheduling type");
281
282  pr->u.p.count = 0;
283
284  if (__kmp_env_consistency_check) {
285    if (st == 0) {
286      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288    }
289  }
290  // compute trip count
291  if (st == 1) { // most common case
292    if (ub >= lb) {
293      tc = ub - lb + 1;
294    } else { // ub < lb
295      tc = 0; // zero-trip
296    }
297  } else if (st < 0) {
298    if (lb >= ub) {
299      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300      // where the division needs to be unsigned regardless of the result type
301      tc = (UT)(lb - ub) / (-st) + 1;
302    } else { // lb < ub
303      tc = 0; // zero-trip
304    }
305  } else { // st > 0
306    if (ub >= lb) {
307      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308      // where the division needs to be unsigned regardless of the result type
309      tc = (UT)(ub - lb) / st + 1;
310    } else { // ub < lb
311      tc = 0; // zero-trip
312    }
313  }
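  // For illustration, two hypothetical trip-count computations under the
  // rules above:
  //   lb=0,  ub=9, st=3  : tc = (9 - 0) / 3 + 1 = 4   (iterations 0,3,6,9)
  //   lb=10, ub=1, st=-2 : tc = (UT)(10 - 1) / 2 + 1 = 5  (10, 8, 6, 4, 2)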
314
315#if KMP_STATS_ENABLED
316  if (KMP_MASTER_GTID(gtid)) {
317    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318  }
319#endif
320
321  pr->u.p.lb = lb;
322  pr->u.p.ub = ub;
323  pr->u.p.st = st;
324  pr->u.p.tc = tc;
325
326#if KMP_OS_WINDOWS
327  pr->u.p.last_upper = ub + st;
328#endif /* KMP_OS_WINDOWS */
329
  /* NOTE: only active parallel regions have active ordered sections */
331
332  if (active) {
333    if (pr->flags.ordered) {
334      pr->ordered_bumped = 0;
335      pr->u.p.ordered_lower = 1;
336      pr->u.p.ordered_upper = 0;
337    }
338  }
339
340  switch (schedule) {
341#if (KMP_STATIC_STEAL_ENABLED)
342  case kmp_sch_static_steal: {
343    T ntc, init;
344
345    KD_TRACE(100,
346             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347              gtid));
348
349    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350    if (nproc > 1 && ntc >= nproc) {
351      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352      T id = tid;
353      T small_chunk, extras;
354
355      small_chunk = ntc / nproc;
356      extras = ntc % nproc;
357
358      init = id * small_chunk + (id < extras ? id : extras);
359      pr->u.p.count = init;
360      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
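      // Illustrative distribution with hypothetical numbers: tc=100, chunk=10
      // gives ntc=10 chunks; with nproc=4, small_chunk=2 and extras=2, so the
      // initial per-thread chunk ranges [count, ub) come out as
      //   tid 0: [0,3)   tid 1: [3,6)   tid 2: [6,8)   tid 3: [8,10)
      // i.e. the first 'extras' threads own one extra chunk each.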
361
362      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread but is capped at
      // nproc.
366      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368      pr->u.p.st = st;
369      if (traits_t<T>::type_size > 4) {
370        // AC: TODO: check if 16-byte CAS available and use it to
371        // improve performance (probably wait for explicit request
372        // before spending time on this).
373        // For now use dynamically allocated per-thread lock,
374        // free memory in __kmp_dispatch_next when status==0.
375        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
376        pr->u.p.th_steal_lock =
377            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378        __kmp_init_lock(pr->u.p.th_steal_lock);
379      }
380      break;
381    } else {
382      /* too few chunks: switching to kmp_sch_dynamic_chunked */
383      schedule = kmp_sch_dynamic_chunked;
384      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385                     "kmp_sch_dynamic_chunked\n",
386                      gtid));
387      if (pr->u.p.parm1 <= 0)
388        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389      break;
390    } // if
391  } // case
392#endif
393  case kmp_sch_static_balanced: {
394    T init, limit;
395
396    KD_TRACE(
397        100,
398        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399         gtid));
400
401    if (nproc > 1) {
402      T id = tid;
403
404      if (tc < nproc) {
405        if (id < tc) {
406          init = id;
407          limit = id;
408          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409        } else {
410          pr->u.p.count = 1; /* means no more chunks to execute */
411          pr->u.p.parm1 = FALSE;
412          break;
413        }
414      } else {
415        T small_chunk = tc / nproc;
416        T extras = tc % nproc;
417        init = id * small_chunk + (id < extras ? id : extras);
418        limit = init + small_chunk - (id < extras ? 0 : 1);
419        pr->u.p.parm1 = (id == nproc - 1);
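        // Example with hypothetical values tc=10, nproc=4: small_chunk=2,
        // extras=2, giving per-thread iteration ranges [init, limit] of
        //   tid 0: [0,2]   tid 1: [3,5]   tid 2: [6,7]   tid 3: [8,9]
        // and only tid nproc-1 reports lastprivate via parm1.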
420      }
421    } else {
422      if (tc > 0) {
423        init = 0;
424        limit = tc - 1;
425        pr->u.p.parm1 = TRUE;
426      } else {
427        // zero trip count
428        pr->u.p.count = 1; /* means no more chunks to execute */
429        pr->u.p.parm1 = FALSE;
430        break;
431      }
432    }
433#if USE_ITT_BUILD
434    // Calculate chunk for metadata report
435    if (itt_need_metadata_reporting)
436      if (cur_chunk)
437        *cur_chunk = limit - init + 1;
438#endif
439    if (st == 1) {
440      pr->u.p.lb = lb + init;
441      pr->u.p.ub = lb + limit;
442    } else {
443      // calculated upper bound, "ub" is user-defined upper bound
444      T ub_tmp = lb + limit * st;
445      pr->u.p.lb = lb + init * st;
446      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447      // it exactly
448      if (st > 0) {
449        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450      } else {
451        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452      }
453    }
454    if (pr->flags.ordered) {
455      pr->u.p.ordered_lower = init;
456      pr->u.p.ordered_upper = limit;
457    }
458    break;
459  } // case
460  case kmp_sch_static_balanced_chunked: {
461    // similar to balanced, but chunk adjusted to multiple of simd width
462    T nth = nproc;
463    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464                   " -> falling-through to static_greedy\n",
465                   gtid));
466    schedule = kmp_sch_static_greedy;
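    // Rounding sketch for the parm1 formula below, with hypothetical values
    // tc=1000, nth=8 and chunk=8 (the simd width, assumed to be a power of
    // two so that & ~(chunk - 1) truncates correctly): ceil(1000/8) = 125,
    // then (125 + 7) & ~7 = 128 iterations per thread, a multiple of the
    // simd width.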
467    if (nth > 1)
468      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469    else
470      pr->u.p.parm1 = tc;
471    break;
472  } // case
473  case kmp_sch_guided_simd:
474  case kmp_sch_guided_iterative_chunked: {
475    KD_TRACE(
476        100,
477        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478         " case\n",
479         gtid));
480
481    if (nproc > 1) {
482      if ((2L * chunk + 1) * nproc >= tc) {
483        /* chunk size too large, switch to dynamic */
484        schedule = kmp_sch_dynamic_chunked;
485      } else {
486        // when remaining iters become less than parm2 - switch to dynamic
487        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488        *(double *)&pr->u.p.parm3 =
489            guided_flt_param / nproc; // may occupy parm3 and parm4
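        // For a feel of these thresholds, take hypothetical values nproc=4
        // and chunk=7, and assume the usual defaults guided_int_param == 2
        // and guided_flt_param == 0.5 from kmp.h: parm2 = 2 * 4 * 8 = 64, so
        // the schedule degrades to dynamic once fewer than 64 iterations
        // remain, and parm3 = 0.5 / 4 = 0.125 is the fraction of the
        // remaining iterations grabbed per request.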
490      }
491    } else {
492      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493                     "kmp_sch_static_greedy\n",
494                     gtid));
495      schedule = kmp_sch_static_greedy;
496      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497      KD_TRACE(
498          100,
499          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500           gtid));
501      pr->u.p.parm1 = tc;
502    } // if
503  } // case
504  break;
505  case kmp_sch_guided_analytical_chunked: {
506    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507                   "kmp_sch_guided_analytical_chunked case\n",
508                   gtid));
509
510    if (nproc > 1) {
511      if ((2L * chunk + 1) * nproc >= tc) {
512        /* chunk size too large, switch to dynamic */
513        schedule = kmp_sch_dynamic_chunked;
514      } else {
515        /* commonly used term: (2 nproc - 1)/(2 nproc) */
516        DBL x;
517
518#if KMP_USE_X87CONTROL
519        /* Linux* OS already has 64-bit computation by default for long double,
520           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522           instead of the default 53-bit. Even though long double doesn't work
523           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524           expected to impact the correctness of the algorithm, but this has not
525           been mathematically proven. */
526        // save original FPCW and set precision to 64-bit, as
527        // Windows* OS on IA-32 architecture defaults to 53-bit
528        unsigned int oldFpcw = _control87(0, 0);
529        _control87(_PC_64, _MCW_PC); // 0,0x30000
530#endif
531        /* value used for comparison in solver for cross-over point */
532        long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533
534        /* crossover point--chunk indexes equal to or greater than
535           this point switch to dynamic-style scheduling */
536        UT cross;
537
538        /* commonly used term: (2 nproc - 1)/(2 nproc) */
539        x = (long double)1.0 - (long double)0.5 / nproc;
540
541#ifdef KMP_DEBUG
542        { // test natural alignment
543          struct _test_a {
544            char a;
545            union {
546              char b;
547              DBL d;
548            };
549          } t;
550          ptrdiff_t natural_alignment =
551              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553          // long)natural_alignment );
554          KMP_DEBUG_ASSERT(
555              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556        }
557#endif // KMP_DEBUG
558
559        /* save the term in thread private dispatch structure */
560        *(DBL *)&pr->u.p.parm3 = x;
561
562        /* solve for the crossover point to the nearest integer i for which C_i
563           <= chunk */
564        {
565          UT left, right, mid;
566          long double p;
567
568          /* estimate initial upper and lower bound */
569
570          /* doesn't matter what value right is as long as it is positive, but
571             it affects performance of the solver */
572          right = 229;
573          p = __kmp_pow<UT>(x, right);
574          if (p > target) {
575            do {
576              p *= p;
577              right <<= 1;
578            } while (p > target && right < (1 << 27));
579            /* lower bound is previous (failed) estimate of upper bound */
580            left = right >> 1;
581          } else {
582            left = 0;
583          }
584
585          /* bisection root-finding method */
586          while (left + 1 < right) {
587            mid = (left + right) / 2;
588            if (__kmp_pow<UT>(x, mid) > target) {
589              left = mid;
590            } else {
591              right = mid;
592            }
593          } // while
594          cross = right;
595        }
596        /* assert sanity of computed crossover point */
597        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598                   __kmp_pow<UT>(x, cross) <= target);
599
600        /* save the crossover point in thread private dispatch structure */
601        pr->u.p.parm2 = cross;
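        /* Worked example with hypothetical values nproc=4, chunk=1, tc=1000:
           x = 1 - 0.5/4 = 0.875 and target = 3*4/1000 = 0.012, so the
           bisection yields cross = 34 because 0.875^33 > 0.012 >= 0.875^34;
           chunk indexes >= 34 then use dynamic-style scheduling. */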
602
603// C75803
604#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606#else
607#define GUIDED_ANALYTICAL_WORKAROUND (x)
608#endif
609        /* dynamic-style scheduling offset */
610        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612                        cross * chunk;
613#if KMP_USE_X87CONTROL
614        // restore FPCW
615        _control87(oldFpcw, _MCW_PC);
616#endif
617      } // if
618    } else {
619      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620                     "kmp_sch_static_greedy\n",
621                     gtid));
622      schedule = kmp_sch_static_greedy;
623      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624      pr->u.p.parm1 = tc;
625    } // if
626  } // case
627  break;
628  case kmp_sch_static_greedy:
629    KD_TRACE(
630        100,
631        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632         gtid));
633    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634    break;
635  case kmp_sch_static_chunked:
636  case kmp_sch_dynamic_chunked:
637    if (pr->u.p.parm1 <= 0) {
638      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639    }
640    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642                   gtid));
643    break;
644  case kmp_sch_trapezoidal: {
645    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646
647    T parm1, parm2, parm3, parm4;
648    KD_TRACE(100,
649             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650              gtid));
651
652    parm1 = chunk;
653
654    /* F : size of the first cycle */
655    parm2 = (tc / (2 * nproc));
656
657    if (parm2 < 1) {
658      parm2 = 1;
659    }
660
661    /* L : size of the last cycle.  Make sure the last cycle is not larger
662       than the first cycle. */
663    if (parm1 < 1) {
664      parm1 = 1;
665    } else if (parm1 > parm2) {
666      parm1 = parm2;
667    }
668
669    /* N : number of cycles */
670    parm3 = (parm2 + parm1);
671    parm3 = (2 * tc + parm3 - 1) / parm3;
672
673    if (parm3 < 2) {
674      parm3 = 2;
675    }
676
677    /* sigma : decreasing incr of the trapezoid */
678    parm4 = (parm3 - 1);
679    parm4 = (parm2 - parm1) / parm4;
680
681    // pointless check, because parm4 >= 0 always
682    // if ( parm4 < 0 ) {
683    //    parm4 = 0;
684    //}
685
686    pr->u.p.parm1 = parm1;
687    pr->u.p.parm2 = parm2;
688    pr->u.p.parm3 = parm3;
689    pr->u.p.parm4 = parm4;
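    // Numerical sketch of the trapezoid parameters with hypothetical values
    // tc=1000, nproc=4, chunk=10: F = parm2 = 1000/8 = 125, L = parm1 = 10,
    // N = parm3 = (2000 + 134) / 135 = 15 cycles, and sigma = parm4 =
    // (125 - 10) / 14 = 8, so chunk sizes shrink 125, 117, 109, ... toward
    // the minimum chunk of 10.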
690  } // case
691  break;
692
693  default: {
694    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695                KMP_HNT(GetNewerLibrary), // Hint
696                __kmp_msg_null // Variadic argument list terminator
697                );
698  } break;
699  } // switch
700  pr->schedule = schedule;
701}
702
703#if KMP_USE_HIER_SCHED
704template <typename T>
705inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706                                             typename traits_t<T>::signed_t st);
707template <>
708inline void
709__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710                                            kmp_int32 ub, kmp_int32 st) {
711  __kmp_dispatch_init_hierarchy<kmp_int32>(
712      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714}
715template <>
716inline void
717__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718                                             kmp_uint32 ub, kmp_int32 st) {
719  __kmp_dispatch_init_hierarchy<kmp_uint32>(
720      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722}
723template <>
724inline void
725__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726                                            kmp_int64 ub, kmp_int64 st) {
727  __kmp_dispatch_init_hierarchy<kmp_int64>(
728      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730}
731template <>
732inline void
733__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734                                             kmp_uint64 ub, kmp_int64 st) {
735  __kmp_dispatch_init_hierarchy<kmp_uint64>(
736      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738}
739
740// free all the hierarchy scheduling memory associated with the team
741void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743  for (int i = 0; i < num_disp_buff; ++i) {
744    // type does not matter here so use kmp_int32
745    auto sh =
746        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747            &team->t.t_disp_buffer[i]);
748    if (sh->hier) {
749      sh->hier->deallocate();
750      __kmp_free(sh->hier);
751    }
752  }
753}
754#endif
755
756// UT - unsigned flavor of T, ST - signed flavor of T,
757// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758template <typename T>
759static void
760__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761                    T ub, typename traits_t<T>::signed_t st,
762                    typename traits_t<T>::signed_t chunk, int push_ws) {
763  typedef typename traits_t<T>::unsigned_t UT;
764
765  int active;
766  kmp_info_t *th;
767  kmp_team_t *team;
768  kmp_uint32 my_buffer_index;
769  dispatch_private_info_template<T> *pr;
770  dispatch_shared_info_template<T> volatile *sh;
771
772  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773                   sizeof(dispatch_private_info));
774  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775                   sizeof(dispatch_shared_info));
776
777  if (!TCR_4(__kmp_init_parallel))
778    __kmp_parallel_initialize();
779
780  __kmp_resume_if_soft_paused();
781
782#if INCLUDE_SSC_MARKS
783  SSC_MARK_DISPATCH_INIT();
784#endif
785#ifdef KMP_DEBUG
786  typedef typename traits_t<T>::signed_t ST;
787  {
788    char *buff;
789    // create format specifiers before the debug output
790    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792                            traits_t<ST>::spec, traits_t<T>::spec,
793                            traits_t<T>::spec, traits_t<ST>::spec);
794    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795    __kmp_str_free(&buff);
796  }
797#endif
798  /* setup data */
799  th = __kmp_threads[gtid];
800  team = th->th.th_team;
801  active = !team->t.t_serialized;
802  th->th.th_ident = loc;
803
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
807  if (schedule == __kmp_static) {
808    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
809  } else {
810    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
811  }
812
813#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
817  bool ordered;
818  enum sched_type my_sched = schedule;
819  my_buffer_index = th->th.th_dispatch->th_disp_index;
820  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821      &th->th.th_dispatch
822           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
825    my_sched =
826        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827  ordered = (kmp_ord_lower & my_sched);
828  if (pr->flags.use_hier) {
829    if (ordered) {
830      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
831                     "Disabling hierarchical scheduling.\n",
832                     gtid));
833      pr->flags.use_hier = FALSE;
834    }
835  }
836  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837    // Don't use hierarchical for ordered parallel loops and don't
838    // use the runtime hierarchy if one was specified in the program
839    if (!ordered && !pr->flags.use_hier)
840      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
841  }
842#endif // KMP_USE_HIER_SCHED
843
844#if USE_ITT_BUILD
845  kmp_uint64 cur_chunk = chunk;
846  int itt_need_metadata_reporting =
847      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849      team->t.t_active_level == 1;
850#endif
851  if (!active) {
852    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
854  } else {
855    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
857
858    my_buffer_index = th->th.th_dispatch->th_disp_index++;
859
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
861    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862        &th->th.th_dispatch
863             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
867                  my_buffer_index));
868  }
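  // The dispatch buffers form a small ring shared by consecutive loops:
  // my_buffer_index grows monotonically per loop, the modulo above selects
  // one of __kmp_dispatch_num_buffers slots, and the wait on sh->buffer_index
  // further below ensures that the previous loop which used this shared slot
  // has been fully retired before this loop starts using it.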
869
870  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
871#if USE_ITT_BUILD
872                                &cur_chunk,
873#endif
874                                chunk, (T)th->th.th_team_nproc,
875                                (T)th->th.th_info.ds.ds_tid);
876  if (active) {
877    if (pr->flags.ordered == 0) {
878      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
880    } else {
881      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
883    }
884  }
885
886  if (active) {
    /* sh->buffer_index should equal my_buffer_index when this buffer is free
     * to use */
889
890    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891                   "sh->buffer_index:%d\n",
892                   gtid, my_buffer_index, sh->buffer_index));
893    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: the buffer index and
    // my_buffer_index are *always* 32-bit integers.
897    KMP_MB(); /* is this necessary? */
898    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899                   "sh->buffer_index:%d\n",
900                   gtid, my_buffer_index, sh->buffer_index));
901
902    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903    th->th.th_dispatch->th_dispatch_sh_current =
904        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
905#if USE_ITT_BUILD
906    if (pr->flags.ordered) {
907      __kmp_itt_ordered_init(gtid);
908    }
909    // Report loop metadata
910    if (itt_need_metadata_reporting) {
911      // Only report metadata by master of active team at level 1
912      kmp_uint64 schedtype = 0;
913      switch (schedule) {
914      case kmp_sch_static_chunked:
915      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
916        break;
917      case kmp_sch_static_greedy:
918        cur_chunk = pr->u.p.parm1;
919        break;
920      case kmp_sch_dynamic_chunked:
921        schedtype = 1;
922        break;
923      case kmp_sch_guided_iterative_chunked:
924      case kmp_sch_guided_analytical_chunked:
925      case kmp_sch_guided_simd:
926        schedtype = 2;
927        break;
928      default:
929        // Should we put this case under "static"?
930        // case kmp_sch_static_steal:
931        schedtype = 3;
932        break;
933      }
934      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
935    }
936#if KMP_USE_HIER_SCHED
937    if (pr->flags.use_hier) {
938      pr->u.p.count = 0;
939      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
940    }
#endif // KMP_USE_HIER_SCHED
942#endif /* USE_ITT_BUILD */
943  }
944
945#ifdef KMP_DEBUG
946  {
947    char *buff;
948    // create format specifiers before the debug output
949    buff = __kmp_str_format(
950        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
951        "lb:%%%s ub:%%%s"
952        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962    __kmp_str_free(&buff);
963  }
964#endif
965#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be a bad case, such as using 0
  // and 1 rather than a program-lifetime increment. So a dedicated variable is
  // required; 'static_steal_counter' is used for this.
971  if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // It serves as a flag showing that, from this point on, other threads may
    // steal from this thread.
975    volatile T *p = &pr->u.p.static_steal_counter;
976    *p = *p + 1;
977  }
978#endif // ( KMP_STATIC_STEAL_ENABLED )
979
980#if OMPT_SUPPORT && OMPT_OPTIONAL
981  if (ompt_enabled.ompt_callback_work) {
982    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984    ompt_callbacks.ompt_callback(ompt_callback_work)(
985        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
987  }
988#endif
989  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
990}
991
992/* For ordered loops, either __kmp_dispatch_finish() should be called after
993 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994 * every chunk of iterations.  If the ordered section(s) were not executed
995 * for this iteration (or every iteration in this chunk), we need to set the
996 * ordered iteration counters so that the next thread can proceed. */
997template <typename UT>
998static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999  typedef typename traits_t<UT>::signed_t ST;
1000  kmp_info_t *th = __kmp_threads[gtid];
1001
1002  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003  if (!th->th.th_team->t.t_serialized) {
1004
1005    dispatch_private_info_template<UT> *pr =
1006        reinterpret_cast<dispatch_private_info_template<UT> *>(
1007            th->th.th_dispatch->th_dispatch_pr_current);
1008    dispatch_shared_info_template<UT> volatile *sh =
1009        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010            th->th.th_dispatch->th_dispatch_sh_current);
1011    KMP_DEBUG_ASSERT(pr);
1012    KMP_DEBUG_ASSERT(sh);
1013    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1015
1016    if (pr->ordered_bumped) {
1017      KD_TRACE(
1018          1000,
1019          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1020           gtid));
1021      pr->ordered_bumped = 0;
1022    } else {
1023      UT lower = pr->u.p.ordered_lower;
1024
1025#ifdef KMP_DEBUG
1026      {
1027        char *buff;
1028        // create format specifiers before the debug output
1029        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030                                "ordered_iteration:%%%s lower:%%%s\n",
1031                                traits_t<UT>::spec, traits_t<UT>::spec);
1032        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033        __kmp_str_free(&buff);
1034      }
1035#endif
1036
1037      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1039      KMP_MB(); /* is this necessary? */
1040#ifdef KMP_DEBUG
1041      {
1042        char *buff;
1043        // create format specifiers before the debug output
1044        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045                                "ordered_iteration:%%%s lower:%%%s\n",
1046                                traits_t<UT>::spec, traits_t<UT>::spec);
1047        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048        __kmp_str_free(&buff);
1049      }
1050#endif
1051
1052      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1053    } // if
1054  } // if
1055  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1056}
1057
1058#ifdef KMP_GOMP_COMPAT
1059
1060template <typename UT>
1061static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062  typedef typename traits_t<UT>::signed_t ST;
1063  kmp_info_t *th = __kmp_threads[gtid];
1064
1065  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066  if (!th->th.th_team->t.t_serialized) {
1067    //        int cid;
1068    dispatch_private_info_template<UT> *pr =
1069        reinterpret_cast<dispatch_private_info_template<UT> *>(
1070            th->th.th_dispatch->th_dispatch_pr_current);
1071    dispatch_shared_info_template<UT> volatile *sh =
1072        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073            th->th.th_dispatch->th_dispatch_sh_current);
1074    KMP_DEBUG_ASSERT(pr);
1075    KMP_DEBUG_ASSERT(sh);
1076    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1078
1079    //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080    UT lower = pr->u.p.ordered_lower;
1081    UT upper = pr->u.p.ordered_upper;
1082    UT inc = upper - lower + 1;
1083
1084    if (pr->ordered_bumped == inc) {
1085      KD_TRACE(
1086          1000,
1087          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1088           gtid));
1089      pr->ordered_bumped = 0;
1090    } else {
1091      inc -= pr->ordered_bumped;
1092
1093#ifdef KMP_DEBUG
1094      {
1095        char *buff;
1096        // create format specifiers before the debug output
1097        buff = __kmp_str_format(
1098            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102        __kmp_str_free(&buff);
1103      }
1104#endif
1105
1106      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1108
1109      KMP_MB(); /* is this necessary? */
1110      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111                      "ordered_bumped to zero\n",
1112                      gtid));
1113      pr->ordered_bumped = 0;
      // TODO: check whether the inc should be unsigned or signed
1115#ifdef KMP_DEBUG
1116      {
1117        char *buff;
1118        // create format specifiers before the debug output
1119        buff = __kmp_str_format(
1120            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123            traits_t<UT>::spec);
1124        KD_TRACE(1000,
1125                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126        __kmp_str_free(&buff);
1127      }
1128#endif
1129
1130      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1131    }
1132    //        }
1133  }
1134  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1135}
1136
1137#endif /* KMP_GOMP_COMPAT */
1138
1139template <typename T>
1140int __kmp_dispatch_next_algorithm(int gtid,
1141                                  dispatch_private_info_template<T> *pr,
1142                                  dispatch_shared_info_template<T> volatile *sh,
1143                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
1144                                  typename traits_t<T>::signed_t *p_st, T nproc,
1145                                  T tid) {
1146  typedef typename traits_t<T>::unsigned_t UT;
1147  typedef typename traits_t<T>::signed_t ST;
1148  typedef typename traits_t<T>::floating_t DBL;
1149  int status = 0;
1150  kmp_int32 last = 0;
1151  T start;
1152  ST incr;
1153  UT limit, trip, init;
1154  kmp_info_t *th = __kmp_threads[gtid];
1155  kmp_team_t *team = th->th.th_team;
1156
1157  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159  KMP_DEBUG_ASSERT(pr);
1160  KMP_DEBUG_ASSERT(sh);
1161  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1162#ifdef KMP_DEBUG
1163  {
1164    char *buff;
1165    // create format specifiers before the debug output
1166    buff =
1167        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168                         "sh:%%p nproc:%%%s tid:%%%s\n",
1169                         traits_t<T>::spec, traits_t<T>::spec);
1170    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171    __kmp_str_free(&buff);
1172  }
1173#endif
1174
1175  // zero trip count
1176  if (pr->u.p.tc == 0) {
1177    KD_TRACE(10,
1178             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1179              "zero status:%d\n",
1180              gtid, status));
1181    return 0;
1182  }
1183
1184  switch (pr->schedule) {
1185#if (KMP_STATIC_STEAL_ENABLED)
1186  case kmp_sch_static_steal: {
1187    T chunk = pr->u.p.parm1;
1188
1189    KD_TRACE(100,
1190             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1191              gtid));
1192
1193    trip = pr->u.p.tc - 1;
1194
1195    if (traits_t<T>::type_size > 4) {
1196      // use lock for 8-byte and CAS for 4-byte induction
1197      // variable. TODO (optional): check and use 16-byte CAS
1198      kmp_lock_t *lck = pr->u.p.th_steal_lock;
1199      KMP_DEBUG_ASSERT(lck != NULL);
1200      if (pr->u.p.count < (UT)pr->u.p.ub) {
1201        __kmp_acquire_lock(lck, gtid);
1202        // try to get own chunk of iterations
1203        init = (pr->u.p.count)++;
1204        status = (init < (UT)pr->u.p.ub);
1205        __kmp_release_lock(lck, gtid);
1206      } else {
1207        status = 0; // no own chunks
1208      }
1209      if (!status) { // try to steal
1210        kmp_info_t **other_threads = team->t.t_threads;
1211        int while_limit = pr->u.p.parm3;
1212        int while_index = 0;
1213        T id = pr->u.p.static_steal_counter; // loop id
1214        int idx = (th->th.th_dispatch->th_disp_index - 1) %
1215                  __kmp_dispatch_num_buffers; // current loop index
1216        // note: victim thread can potentially execute another loop
1217        // TODO: algorithm of searching for a victim
1218        // should be cleaned up and measured
1219        while ((!status) && (while_limit != ++while_index)) {
1220          dispatch_private_info_template<T> *victim;
1221          T remaining;
1222          T victimIdx = pr->u.p.parm4;
1223          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1224          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1225              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1226          KMP_DEBUG_ASSERT(victim);
1227          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1228                 oldVictimIdx != victimIdx) {
1229            victimIdx = (victimIdx + 1) % nproc;
1230            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1231                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1232            KMP_DEBUG_ASSERT(victim);
1233          }
1234          if (victim == pr || id != victim->u.p.static_steal_counter) {
1235            continue; // try once more (nproc attempts in total)
1236            // no victim is ready yet to participate in stealing
1237            // because no victim passed kmp_init_dispatch yet
1238          }
1239          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1240            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1241            continue; // not enough chunks to steal, goto next victim
1242          }
1243
1244          lck = victim->u.p.th_steal_lock;
1245          KMP_ASSERT(lck != NULL);
1246          __kmp_acquire_lock(lck, gtid);
1247          limit = victim->u.p.ub; // keep initial ub
1248          if (victim->u.p.count >= limit ||
1249              (remaining = limit - victim->u.p.count) < 2) {
1250            __kmp_release_lock(lck, gtid);
1251            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1252            continue; // not enough chunks to steal
1253          }
1254          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1255          // by 1
1256          if (remaining > 3) {
1257            // steal 1/4 of remaining
1258            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1259            init = (victim->u.p.ub -= (remaining >> 2));
1260          } else {
1261            // steal 1 chunk of 2 or 3 remaining
1262            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1263            init = (victim->u.p.ub -= 1);
1264          }
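          // Example of the steal above with hypothetical values: the victim
          // has count=4 and ub=20, so remaining=16 and the thief takes
          // 16 >> 2 = 4 chunks by lowering victim->u.p.ub to 16; init=16,
          // and after the bookkeeping below the thief owns chunks [16, 20)
          // with chunk 16 returned from the current call.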
1265          __kmp_release_lock(lck, gtid);
1266
1267          KMP_DEBUG_ASSERT(init + 1 <= limit);
1268          pr->u.p.parm4 = victimIdx; // remember victim to steal from
1269          status = 1;
1270          while_index = 0;
          // now update own count and ub with the stolen range, excluding the
          // init chunk (returned by the current call)
1272          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1273          pr->u.p.count = init + 1;
1274          pr->u.p.ub = limit;
1275          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1276        } // while (search for victim)
1277      } // if (try to find victim and steal)
1278    } else {
1279      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1280      typedef union {
1281        struct {
1282          UT count;
1283          T ub;
1284        } p;
1285        kmp_int64 b;
1286      } union_i4;
      // All operations on 'count' or 'ub' must be performed atomically
      // together.
1289      {
1290        union_i4 vold, vnew;
1291        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1292        vnew = vold;
1293        vnew.p.count++;
1294        while (!KMP_COMPARE_AND_STORE_ACQ64(
1295            (volatile kmp_int64 *)&pr->u.p.count,
1296            *VOLATILE_CAST(kmp_int64 *) & vold.b,
1297            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1298          KMP_CPU_PAUSE();
1299          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1300          vnew = vold;
1301          vnew.p.count++;
1302        }
1303        vnew = vold;
1304        init = vnew.p.count;
1305        status = (init < (UT)vnew.p.ub);
1306      }
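      // Sketch of the packed update above, using hypothetical values count=5
      // and ub=12 packed into one 64-bit word: the CAS bumps count to 6 only
      // if neither field changed concurrently, so init=5 and
      // status = (5 < 12) = 1, i.e. chunk 5 still belongs to this thread.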
1307
1308      if (!status) {
1309        kmp_info_t **other_threads = team->t.t_threads;
1310        int while_limit = pr->u.p.parm3;
1311        int while_index = 0;
1312        T id = pr->u.p.static_steal_counter; // loop id
1313        int idx = (th->th.th_dispatch->th_disp_index - 1) %
1314                  __kmp_dispatch_num_buffers; // current loop index
1315        // note: victim thread can potentially execute another loop
1316        // TODO: algorithm of searching for a victim
1317        // should be cleaned up and measured
1318        while ((!status) && (while_limit != ++while_index)) {
1319          dispatch_private_info_template<T> *victim;
1320          union_i4 vold, vnew;
1321          kmp_int32 remaining;
1322          T victimIdx = pr->u.p.parm4;
1323          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1324          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1325              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1326          KMP_DEBUG_ASSERT(victim);
1327          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1328                 oldVictimIdx != victimIdx) {
1329            victimIdx = (victimIdx + 1) % nproc;
1330            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1331                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1332            KMP_DEBUG_ASSERT(victim);
1333          }
1334          if (victim == pr || id != victim->u.p.static_steal_counter) {
1335            continue; // try once more (nproc attempts in total)
1336            // no victim is ready yet to participate in stealing
1337            // because no victim passed kmp_init_dispatch yet
1338          }
1339          pr->u.p.parm4 = victimIdx; // new victim found
1340          while (1) { // CAS loop if victim has enough chunks to steal
1341            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1342            vnew = vold;
1343
1344            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1345            if (vnew.p.count >= (UT)vnew.p.ub ||
1346                (remaining = vnew.p.ub - vnew.p.count) < 2) {
1347              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1348              break; // not enough chunks to steal, goto next victim
1349            }
1350            if (remaining > 3) {
1351              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1352            } else {
1353              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1354            }
1355            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1356            // TODO: Should this be acquire or release?
1357            if (KMP_COMPARE_AND_STORE_ACQ64(
1358                    (volatile kmp_int64 *)&victim->u.p.count,
1359                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
1360                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1361              // stealing succeeded
1362              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1363                                        vold.p.ub - vnew.p.ub);
1364              status = 1;
1365              while_index = 0;
1366              // now update own count and ub
1367              init = vnew.p.ub;
1368              vold.p.count = init + 1;
1369#if KMP_ARCH_X86
1370              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1371#else
1372              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1373#endif
1374              break;
1375            } // if (check CAS result)
1376            KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1377          } // while (try to steal from particular victim)
1378        } // while (search for victim)
1379      } // if (try to find victim and steal)
1380    } // if (4-byte induction variable)
1381    if (!status) {
1382      *p_lb = 0;
1383      *p_ub = 0;
1384      if (p_st != NULL)
1385        *p_st = 0;
1386    } else {
1387      start = pr->u.p.parm2;
1388      init *= chunk;
1389      limit = chunk + init - 1;
1390      incr = pr->u.p.st;
1391      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1392
1393      KMP_DEBUG_ASSERT(init <= trip);
1394      if ((last = (limit >= trip)) != 0)
1395        limit = trip;
1396      if (p_st != NULL)
1397        *p_st = incr;
1398
1399      if (incr == 1) {
1400        *p_lb = start + init;
1401        *p_ub = start + limit;
1402      } else {
1403        *p_lb = start + init * incr;
1404        *p_ub = start + limit * incr;
1405      }
1406
1407      if (pr->flags.ordered) {
1408        pr->u.p.ordered_lower = init;
1409        pr->u.p.ordered_upper = limit;
1410      } // if
1411    } // if
1412    break;
1413  } // case
1414#endif // ( KMP_STATIC_STEAL_ENABLED )
1415  case kmp_sch_static_balanced: {
1416    KD_TRACE(
1417        10,
1418        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1419         gtid));
1420    /* check if thread has any iteration to do */
1421    if ((status = !pr->u.p.count) != 0) {
1422      pr->u.p.count = 1;
1423      *p_lb = pr->u.p.lb;
1424      *p_ub = pr->u.p.ub;
1425      last = pr->u.p.parm1;
1426      if (p_st != NULL)
1427        *p_st = pr->u.p.st;
1428    } else { /* no iterations to do */
1429      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1430    }
1431  } // case
1432  break;
1433  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1434                                 merged here */
1435  case kmp_sch_static_chunked: {
1436    T parm1;
1437
1438    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1439                   "kmp_sch_static_[affinity|chunked] case\n",
1440                   gtid));
1441    parm1 = pr->u.p.parm1;
1442
1443    trip = pr->u.p.tc - 1;
1444    init = parm1 * (pr->u.p.count + tid);
1445
1446    if ((status = (init <= trip)) != 0) {
1447      start = pr->u.p.lb;
1448      incr = pr->u.p.st;
1449      limit = parm1 + init - 1;
1450
1451      if ((last = (limit >= trip)) != 0)
1452        limit = trip;
1453
1454      if (p_st != NULL)
1455        *p_st = incr;
1456
1457      pr->u.p.count += nproc;
1458
1459      if (incr == 1) {
1460        *p_lb = start + init;
1461        *p_ub = start + limit;
1462      } else {
1463        *p_lb = start + init * incr;
1464        *p_ub = start + limit * incr;
1465      }
1466
1467      if (pr->flags.ordered) {
1468        pr->u.p.ordered_lower = init;
1469        pr->u.p.ordered_upper = limit;
1470      } // if
1471    } // if
1472  } // case
1473  break;
1474
1475  case kmp_sch_dynamic_chunked: {
1476    T chunk = pr->u.p.parm1;
1477
1478    KD_TRACE(
1479        100,
1480        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1481         gtid));
1482
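    // sh->u.s.iteration is the shared index of the next unclaimed chunk; each
    // thread atomically claims one index and scales it by the chunk size to
    // obtain its iteration offset.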
1483    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1484    trip = pr->u.p.tc - 1;
1485
1486    if ((status = (init <= trip)) == 0) {
1487      *p_lb = 0;
1488      *p_ub = 0;
1489      if (p_st != NULL)
1490        *p_st = 0;
1491    } else {
1492      start = pr->u.p.lb;
1493      limit = chunk + init - 1;
1494      incr = pr->u.p.st;
1495
1496      if ((last = (limit >= trip)) != 0)
1497        limit = trip;
1498
1499      if (p_st != NULL)
1500        *p_st = incr;
1501
1502      if (incr == 1) {
1503        *p_lb = start + init;
1504        *p_ub = start + limit;
1505      } else {
1506        *p_lb = start + init * incr;
1507        *p_ub = start + limit * incr;
1508      }
1509
1510      if (pr->flags.ordered) {
1511        pr->u.p.ordered_lower = init;
1512        pr->u.p.ordered_upper = limit;
1513      } // if
1514    } // if
1515  } // case
1516  break;
1517
1518  case kmp_sch_guided_iterative_chunked: {
1519    T chunkspec = pr->u.p.parm1;
1520    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1521                   "iterative case\n",
1522                   gtid));
1523    trip = pr->u.p.tc;
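    // Guided scheduling: each successful claim takes roughly
    // remaining / (K * nproc) iterations (parm3 caches roughly 1/(K*nproc) as
    // a double), falling back to plain dynamic chunks of size chunkspec once
    // fewer than parm2 iterations remain.  Illustrative example: trip=1000,
    // nproc=4, K=2 gives a first claim of about 125 iterations.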
1524    // Start atomic part of calculations
1525    while (1) {
1526      ST remaining; // signed, because can be < 0
1527      init = sh->u.s.iteration; // shared value
1528      remaining = trip - init;
1529      if (remaining <= 0) { // AC: need to compare with 0 first
1530        // nothing to do, don't try atomic op
1531        status = 0;
1532        break;
1533      }
1534      if ((T)remaining <
1535          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1536        // use dynamic-style schedule
1537        // atomically increment iterations, get old value
1538        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1539                                 (ST)chunkspec);
1540        remaining = trip - init;
1541        if (remaining <= 0) {
1542          status = 0; // all iterations were taken by other threads
1543        } else {
1544          // got some iterations to work on
1545          status = 1;
1546          if ((T)remaining > chunkspec) {
1547            limit = init + chunkspec - 1;
1548          } else {
1549            last = 1; // the last chunk
1550            limit = init + remaining - 1;
1551          } // if
1552        } // if
1553        break;
1554      } // if
1555      limit = init +
1556              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1557      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1558                               (ST)init, (ST)limit)) {
1559        // CAS was successful, chunk obtained
1560        status = 1;
1561        --limit;
1562        break;
1563      } // if
1564    } // while
1565    if (status != 0) {
1566      start = pr->u.p.lb;
1567      incr = pr->u.p.st;
1568      if (p_st != NULL)
1569        *p_st = incr;
1570      *p_lb = start + init * incr;
1571      *p_ub = start + limit * incr;
1572      if (pr->flags.ordered) {
1573        pr->u.p.ordered_lower = init;
1574        pr->u.p.ordered_upper = limit;
1575      } // if
1576    } else {
1577      *p_lb = 0;
1578      *p_ub = 0;
1579      if (p_st != NULL)
1580        *p_st = 0;
1581    } // if
1582  } // case
1583  break;
1584
1585  case kmp_sch_guided_simd: {
1586    // same as the iterative case, but the current chunk is adjusted to be a
1587    // multiple of the given chunk
1588    T chunk = pr->u.p.parm1;
1589    KD_TRACE(100,
1590             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1591              gtid));
1592    trip = pr->u.p.tc;
1593    // Start atomic part of calculations
1594    while (1) {
1595      ST remaining; // signed, because can be < 0
1596      init = sh->u.s.iteration; // shared value
1597      remaining = trip - init;
1598      if (remaining <= 0) { // AC: need to compare with 0 first
1599        status = 0; // nothing to do, don't try atomic op
1600        break;
1601      }
1602      KMP_DEBUG_ASSERT(init % chunk == 0);
1603      // compare with K*nproc*(chunk+1), K=2 by default
1604      if ((T)remaining < pr->u.p.parm2) {
1605        // use dynamic-style schedule
1606        // atomically increment iterations, get old value
1607        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1608                                 (ST)chunk);
1609        remaining = trip - init;
1610        if (remaining <= 0) {
1611          status = 0; // all iterations were taken by other threads
1612        } else {
1613          // got some iterations to work on
1614          status = 1;
1615          if ((T)remaining > chunk) {
1616            limit = init + chunk - 1;
1617          } else {
1618            last = 1; // the last chunk
1619            limit = init + remaining - 1;
1620          } // if
1621        } // if
1622        break;
1623      } // if
1624      // divide by K*nproc
1625      UT span = remaining * (*(double *)&pr->u.p.parm3);
1626      UT rem = span % chunk;
1627      if (rem) // adjust so that span%chunk == 0
1628        span += chunk - rem;
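      // e.g. (illustrative) chunk=8, span=13: rem=5, so span becomes 16 and
      // every claimed block stays a multiple of the SIMD chunk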
1629      limit = init + span;
1630      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1631                               (ST)init, (ST)limit)) {
1632        // CAS was successful, chunk obtained
1633        status = 1;
1634        --limit;
1635        break;
1636      } // if
1637    } // while
1638    if (status != 0) {
1639      start = pr->u.p.lb;
1640      incr = pr->u.p.st;
1641      if (p_st != NULL)
1642        *p_st = incr;
1643      *p_lb = start + init * incr;
1644      *p_ub = start + limit * incr;
1645      if (pr->flags.ordered) {
1646        pr->u.p.ordered_lower = init;
1647        pr->u.p.ordered_upper = limit;
1648      } // if
1649    } else {
1650      *p_lb = 0;
1651      *p_ub = 0;
1652      if (p_st != NULL)
1653        *p_st = 0;
1654    } // if
1655  } // case
1656  break;
1657
1658  case kmp_sch_guided_analytical_chunked: {
1659    T chunkspec = pr->u.p.parm1;
1660    UT chunkIdx;
1661#if KMP_USE_X87CONTROL
1662    /* for storing the original FPCW value for Windows* OS on
1663       IA-32 architecture (8-byte version) */
1664    unsigned int oldFpcw;
1665    unsigned int fpcwSet = 0;
1666#endif
1667    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1668                   "kmp_sch_guided_analytical_chunked case\n",
1669                   gtid));
1670
1671    trip = pr->u.p.tc;
1672
1673    KMP_DEBUG_ASSERT(nproc > 1);
1674    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
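    // Analytical guided schedule: chunk boundaries are computed from
    // __kmp_dispatch_guided_remaining() using the coefficient cached in parm3;
    // once the shared chunk index reaches parm2, the leftover iterations are
    // handed out as plain dynamic chunks of size chunkspec.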
1675
1676    while (1) { /* this while loop is a safeguard against unexpected zero
1677                   chunk sizes */
1678      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1679      if (chunkIdx >= (UT)pr->u.p.parm2) {
1680        --trip;
1681        /* use dynamic-style scheduling */
1682        init = chunkIdx * chunkspec + pr->u.p.count;
1683        /* need to verify init > 0 in case of overflow in the above
1684         * calculation */
1685        if ((status = (init > 0 && init <= trip)) != 0) {
1686          limit = init + chunkspec - 1;
1687
1688          if ((last = (limit >= trip)) != 0)
1689            limit = trip;
1690        }
1691        break;
1692      } else {
1693/* use exponential-style scheduling */
1694/* The following check works around the lack of long double precision on
1695   Windows* OS.
1696   Without it, init might not be 0 for chunkIdx == 0.
1697 */
1698#if KMP_USE_X87CONTROL
1699        /* If we haven't already done so, save original
1700           FPCW and set precision to 64-bit, as Windows* OS
1701           on IA-32 architecture defaults to 53-bit */
1702        if (!fpcwSet) {
1703          oldFpcw = _control87(0, 0);
1704          _control87(_PC_64, _MCW_PC);
1705          fpcwSet = 0x30000;
1706        }
1707#endif
1708        if (chunkIdx) {
1709          init = __kmp_dispatch_guided_remaining<T>(
1710              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1711          KMP_DEBUG_ASSERT(init);
1712          init = trip - init;
1713        } else
1714          init = 0;
1715        limit = trip - __kmp_dispatch_guided_remaining<T>(
1716                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1717        KMP_ASSERT(init <= limit);
1718        if (init < limit) {
1719          KMP_DEBUG_ASSERT(limit <= trip);
1720          --limit;
1721          status = 1;
1722          break;
1723        } // if
1724      } // if
1725    } // while (1)
1726#if KMP_USE_X87CONTROL
1727    /* restore FPCW if necessary
1728       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1729    */
1730    if (fpcwSet && (oldFpcw & fpcwSet))
1731      _control87(oldFpcw, _MCW_PC);
1732#endif
1733    if (status != 0) {
1734      start = pr->u.p.lb;
1735      incr = pr->u.p.st;
1736      if (p_st != NULL)
1737        *p_st = incr;
1738      *p_lb = start + init * incr;
1739      *p_ub = start + limit * incr;
1740      if (pr->flags.ordered) {
1741        pr->u.p.ordered_lower = init;
1742        pr->u.p.ordered_upper = limit;
1743      }
1744    } else {
1745      *p_lb = 0;
1746      *p_ub = 0;
1747      if (p_st != NULL)
1748        *p_st = 0;
1749    }
1750  } // case
1751  break;
1752
1753  case kmp_sch_trapezoidal: {
1754    UT index;
1755    T parm2 = pr->u.p.parm2;
1756    T parm3 = pr->u.p.parm3;
1757    T parm4 = pr->u.p.parm4;
1758    KD_TRACE(100,
1759             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1760              gtid));
1761
1762    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1763
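    // Trapezoid schedule: chunk k has size parm2 - k*parm4 (sizes shrink
    // linearly), so the first 'index' chunks cover
    // index*(2*parm2 - (index-1)*parm4)/2 iterations, which is the start
    // offset computed below.  Illustrative example: parm2=10, parm4=2,
    // index=2 gives init = 2*(20 - 2)/2 = 18 (after chunks of 10 and 8).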
1764    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1765    trip = pr->u.p.tc - 1;
1766
1767    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1768      *p_lb = 0;
1769      *p_ub = 0;
1770      if (p_st != NULL)
1771        *p_st = 0;
1772    } else {
1773      start = pr->u.p.lb;
1774      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1775      incr = pr->u.p.st;
1776
1777      if ((last = (limit >= trip)) != 0)
1778        limit = trip;
1779
1780      if (p_st != NULL)
1781        *p_st = incr;
1782
1783      if (incr == 1) {
1784        *p_lb = start + init;
1785        *p_ub = start + limit;
1786      } else {
1787        *p_lb = start + init * incr;
1788        *p_ub = start + limit * incr;
1789      }
1790
1791      if (pr->flags.ordered) {
1792        pr->u.p.ordered_lower = init;
1793        pr->u.p.ordered_upper = limit;
1794      } // if
1795    } // if
1796  } // case
1797  break;
1798  default: {
1799    status = 0; // to avoid complaints about uninitialized variable use
1800    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1801                KMP_HNT(GetNewerLibrary), // Hint
1802                __kmp_msg_null // Variadic argument list terminator
1803                );
1804  } break;
1805  } // switch
1806  if (p_last)
1807    *p_last = last;
1808#ifdef KMP_DEBUG
1809  if (pr->flags.ordered) {
1810    char *buff;
1811    // create format specifiers before the debug output
1812    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1813                            "ordered_lower:%%%s ordered_upper:%%%s\n",
1814                            traits_t<UT>::spec, traits_t<UT>::spec);
1815    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1816    __kmp_str_free(&buff);
1817  }
1818  {
1819    char *buff;
1820    // create format specifiers before the debug output
1821    buff = __kmp_str_format(
1822        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1823        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1824        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1825    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1826    __kmp_str_free(&buff);
1827  }
1828#endif
1829  return status;
1830}
1831
1832/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1833   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1834   is not called. */
1835#if OMPT_SUPPORT && OMPT_OPTIONAL
1836#define OMPT_LOOP_END                                                          \
1837  if (status == 0) {                                                           \
1838    if (ompt_enabled.ompt_callback_work) {                                     \
1839      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1840      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1841      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1842          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1843          &(task_info->task_data), 0, codeptr);                                \
1844    }                                                                          \
1845  }
1846// TODO: implement count
1847#else
1848#define OMPT_LOOP_END // no-op
1849#endif
1850
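// KMP_STATS_LOOP_END computes the trip count of the chunk just handed out from
// the inclusive bounds and the stride and records it as
// OMP_loop_dynamic_iterations, e.g. (illustrative) l=0, u=9, i=2 gives
// t = (9 - 0) / 2 + 1 = 5; when status == 0 it also pops the current
// partitioned timer.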
1851#if KMP_STATS_ENABLED
1852#define KMP_STATS_LOOP_END                                                     \
1853  {                                                                            \
1854    kmp_int64 u, l, t, i;                                                      \
1855    l = (kmp_int64)(*p_lb);                                                    \
1856    u = (kmp_int64)(*p_ub);                                                    \
1857    i = (kmp_int64)(pr->u.p.st);                                               \
1858    if (status == 0) {                                                         \
1859      t = 0;                                                                   \
1860      KMP_POP_PARTITIONED_TIMER();                                             \
1861    } else if (i == 1) {                                                       \
1862      if (u >= l)                                                              \
1863        t = u - l + 1;                                                         \
1864      else                                                                     \
1865        t = 0;                                                                 \
1866    } else if (i < 0) {                                                        \
1867      if (l >= u)                                                              \
1868        t = (l - u) / (-i) + 1;                                                \
1869      else                                                                     \
1870        t = 0;                                                                 \
1871    } else {                                                                   \
1872      if (u >= l)                                                              \
1873        t = (u - l) / i + 1;                                                   \
1874      else                                                                     \
1875        t = 0;                                                                 \
1876    }                                                                          \
1877    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1878  }
1879#else
1880#define KMP_STATS_LOOP_END /* Nothing */
1881#endif
1882
1883template <typename T>
1884static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1885                               T *p_lb, T *p_ub,
1886                               typename traits_t<T>::signed_t *p_st
1887#if OMPT_SUPPORT && OMPT_OPTIONAL
1888                               ,
1889                               void *codeptr
1890#endif
1891                               ) {
1892
1893  typedef typename traits_t<T>::unsigned_t UT;
1894  typedef typename traits_t<T>::signed_t ST;
1895  // This is potentially slightly misleading: schedule(runtime) will appear here
1896  // even if the actual runtime schedule is static. (This highlights a
1897  // disadvantage of schedule(runtime): even when static scheduling is used, it
1898  // costs more than a compile-time choice of static scheduling would.)
1899  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1900
1901  int status;
1902  dispatch_private_info_template<T> *pr;
1903  kmp_info_t *th = __kmp_threads[gtid];
1904  kmp_team_t *team = th->th.th_team;
1905
1906  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1907  KD_TRACE(
1908      1000,
1909      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1910       gtid, p_lb, p_ub, p_st, p_last));
1911
1912  if (team->t.t_serialized) {
1913    /* NOTE: serialize this dispatch because we are not at the active level */
1914    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1915        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1916    KMP_DEBUG_ASSERT(pr);
1917
1918    if ((status = (pr->u.p.tc != 0)) == 0) {
1919      *p_lb = 0;
1920      *p_ub = 0;
1921      //            if ( p_last != NULL )
1922      //                *p_last = 0;
1923      if (p_st != NULL)
1924        *p_st = 0;
1925      if (__kmp_env_consistency_check) {
1926        if (pr->pushed_ws != ct_none) {
1927          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1928        }
1929      }
1930    } else if (pr->flags.nomerge) {
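      // Serialized team with a nomerge schedule: iterations are still handed
      // out one chunk at a time from this thread's private buffer, with
      // u.p.count acting as the next chunk index.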
1931      kmp_int32 last;
1932      T start;
1933      UT limit, trip, init;
1934      ST incr;
1935      T chunk = pr->u.p.parm1;
1936
1937      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1938                     gtid));
1939
1940      init = chunk * pr->u.p.count++;
1941      trip = pr->u.p.tc - 1;
1942
1943      if ((status = (init <= trip)) == 0) {
1944        *p_lb = 0;
1945        *p_ub = 0;
1946        //                if ( p_last != NULL )
1947        //                    *p_last = 0;
1948        if (p_st != NULL)
1949          *p_st = 0;
1950        if (__kmp_env_consistency_check) {
1951          if (pr->pushed_ws != ct_none) {
1952            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1953          }
1954        }
1955      } else {
1956        start = pr->u.p.lb;
1957        limit = chunk + init - 1;
1958        incr = pr->u.p.st;
1959
1960        if ((last = (limit >= trip)) != 0) {
1961          limit = trip;
1962#if KMP_OS_WINDOWS
1963          pr->u.p.last_upper = pr->u.p.ub;
1964#endif /* KMP_OS_WINDOWS */
1965        }
1966        if (p_last != NULL)
1967          *p_last = last;
1968        if (p_st != NULL)
1969          *p_st = incr;
1970        if (incr == 1) {
1971          *p_lb = start + init;
1972          *p_ub = start + limit;
1973        } else {
1974          *p_lb = start + init * incr;
1975          *p_ub = start + limit * incr;
1976        }
1977
1978        if (pr->flags.ordered) {
1979          pr->u.p.ordered_lower = init;
1980          pr->u.p.ordered_upper = limit;
1981#ifdef KMP_DEBUG
1982          {
1983            char *buff;
1984            // create format specifiers before the debug output
1985            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1986                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
1987                                    traits_t<UT>::spec, traits_t<UT>::spec);
1988            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1989                            pr->u.p.ordered_upper));
1990            __kmp_str_free(&buff);
1991          }
1992#endif
1993        } // if
1994      } // if
1995    } else {
1996      pr->u.p.tc = 0;
1997      *p_lb = pr->u.p.lb;
1998      *p_ub = pr->u.p.ub;
1999#if KMP_OS_WINDOWS
2000      pr->u.p.last_upper = *p_ub;
2001#endif /* KMP_OS_WINDOWS */
2002      if (p_last != NULL)
2003        *p_last = TRUE;
2004      if (p_st != NULL)
2005        *p_st = pr->u.p.st;
2006    } // if
2007#ifdef KMP_DEBUG
2008    {
2009      char *buff;
2010      // create format specifiers before the debug output
2011      buff = __kmp_str_format(
2012          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2013          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2014          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2015      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2016      __kmp_str_free(&buff);
2017    }
2018#endif
2019#if INCLUDE_SSC_MARKS
2020    SSC_MARK_DISPATCH_NEXT();
2021#endif
2022    OMPT_LOOP_END;
2023    KMP_STATS_LOOP_END;
2024    return status;
2025  } else {
2026    kmp_int32 last = 0;
2027    dispatch_shared_info_template<T> volatile *sh;
2028
2029    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2030                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2031
2032    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2033        th->th.th_dispatch->th_dispatch_pr_current);
2034    KMP_DEBUG_ASSERT(pr);
2035    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2036        th->th.th_dispatch->th_dispatch_sh_current);
2037    KMP_DEBUG_ASSERT(sh);
2038
2039#if KMP_USE_HIER_SCHED
2040    if (pr->flags.use_hier)
2041      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2042    else
2043#endif // KMP_USE_HIER_SCHED
2044      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2045                                                p_st, th->th.th_team_nproc,
2046                                                th->th.th_info.ds.ds_tid);
2047    // status == 0: no more iterations to execute
2048    if (status == 0) {
2049      UT num_done;
2050
2051      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
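      // num_done counts the threads that have exhausted this loop; the thread
      // that observes num_done == nproc - 1 below is the last one out and
      // resets the shared buffer so it can be recycled for a later loop.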
2052#ifdef KMP_DEBUG
2053      {
2054        char *buff;
2055        // create format specifiers before the debug output
2056        buff = __kmp_str_format(
2057            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2058            traits_t<UT>::spec);
2059        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2060        __kmp_str_free(&buff);
2061      }
2062#endif
2063
2064#if KMP_USE_HIER_SCHED
2065      pr->flags.use_hier = FALSE;
2066#endif
2067      if ((ST)num_done == th->th.th_team_nproc - 1) {
2068#if (KMP_STATIC_STEAL_ENABLED)
2069        if (pr->schedule == kmp_sch_static_steal &&
2070            traits_t<T>::type_size > 4) {
2071          int i;
2072          int idx = (th->th.th_dispatch->th_disp_index - 1) %
2073                    __kmp_dispatch_num_buffers; // current loop index
2074          kmp_info_t **other_threads = team->t.t_threads;
2075          // loop complete, safe to destroy locks used for stealing
2076          for (i = 0; i < th->th.th_team_nproc; ++i) {
2077            dispatch_private_info_template<T> *buf =
2078                reinterpret_cast<dispatch_private_info_template<T> *>(
2079                    &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2080            kmp_lock_t *lck = buf->u.p.th_steal_lock;
2081            KMP_ASSERT(lck != NULL);
2082            __kmp_destroy_lock(lck);
2083            __kmp_free(lck);
2084            buf->u.p.th_steal_lock = NULL;
2085          }
2086        }
2087#endif
2088        /* NOTE: release this buffer to be reused */
2089
2090        KMP_MB(); /* Flush all pending memory write invalidates.  */
2091
2092        sh->u.s.num_done = 0;
2093        sh->u.s.iteration = 0;
2094
2095        /* TODO replace with general release procedure? */
2096        if (pr->flags.ordered) {
2097          sh->u.s.ordered_iteration = 0;
2098        }
2099
2100        KMP_MB(); /* Flush all pending memory write invalidates.  */
2101
2102        sh->buffer_index += __kmp_dispatch_num_buffers;
2103        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2104                       gtid, sh->buffer_index));
2105
2106        KMP_MB(); /* Flush all pending memory write invalidates.  */
2107
2108      } // if
2109      if (__kmp_env_consistency_check) {
2110        if (pr->pushed_ws != ct_none) {
2111          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2112        }
2113      }
2114
2115      th->th.th_dispatch->th_deo_fcn = NULL;
2116      th->th.th_dispatch->th_dxo_fcn = NULL;
2117      th->th.th_dispatch->th_dispatch_sh_current = NULL;
2118      th->th.th_dispatch->th_dispatch_pr_current = NULL;
2119    } // if (status == 0)
2120#if KMP_OS_WINDOWS
2121    else if (last) {
2122      pr->u.p.last_upper = pr->u.p.ub;
2123    }
2124#endif /* KMP_OS_WINDOWS */
2125    if (p_last != NULL && status != 0)
2126      *p_last = last;
2127  } // if
2128
2129#ifdef KMP_DEBUG
2130  {
2131    char *buff;
2132    // create format specifiers before the debug output
2133    buff = __kmp_str_format(
2134        "__kmp_dispatch_next: T#%%d normal case: "
2135        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2136        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2137    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2138                  (p_last ? *p_last : 0), status));
2139    __kmp_str_free(&buff);
2140  }
2141#endif
2142#if INCLUDE_SSC_MARKS
2143  SSC_MARK_DISPATCH_NEXT();
2144#endif
2145  OMPT_LOOP_END;
2146  KMP_STATS_LOOP_END;
2147  return status;
2148}
2149
2150template <typename T>
2151static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2152                                  kmp_int32 *plastiter, T *plower, T *pupper,
2153                                  typename traits_t<T>::signed_t incr) {
2154  typedef typename traits_t<T>::unsigned_t UT;
2155  kmp_uint32 team_id;
2156  kmp_uint32 nteams;
2157  UT trip_count;
2158  kmp_team_t *team;
2159  kmp_info_t *th;
2160
2161  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2162  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2163#ifdef KMP_DEBUG
2164  typedef typename traits_t<T>::signed_t ST;
2165  {
2166    char *buff;
2167    // create format specifiers before the debug output
2168    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2169                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2170                            traits_t<T>::spec, traits_t<T>::spec,
2171                            traits_t<ST>::spec, traits_t<T>::spec);
2172    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2173    __kmp_str_free(&buff);
2174  }
2175#endif
2176
2177  if (__kmp_env_consistency_check) {
2178    if (incr == 0) {
2179      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2180                            loc);
2181    }
2182    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2183      // The loop is illegal.
2184      // Some zero-trip loops are maintained by the compiler, e.g.:
2185      //   for(i=10;i<0;++i) // lower >= upper - run-time check
2186      //   for(i=0;i>10;--i) // lower <= upper - run-time check
2187      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2188      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2189      // The compiler does not check the following illegal loops:
2190      //   for(i=0;i<10;i+=incr) // where incr<0
2191      //   for(i=10;i>0;i-=incr) // where incr<0
2192      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2193    }
2194  }
2195  th = __kmp_threads[gtid];
2196  team = th->th.th_team;
2197  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2198  nteams = th->th.th_teams_size.nteams;
2199  team_id = team->t.t_master_tid;
2200  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2201
2202  // compute global trip count
2203  if (incr == 1) {
2204    trip_count = *pupper - *plower + 1;
2205  } else if (incr == -1) {
2206    trip_count = *plower - *pupper + 1;
2207  } else if (incr > 0) {
2208    // upper-lower can exceed the limit of signed type
2209    trip_count = (UT)(*pupper - *plower) / incr + 1;
2210  } else {
2211    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2212  }
2213
2214  if (trip_count <= nteams) {
2215    KMP_DEBUG_ASSERT(
2216        __kmp_static == kmp_sch_static_greedy ||
2217        __kmp_static ==
2218            kmp_sch_static_balanced); // Unknown static scheduling type.
2219    // only some teams get a single iteration, the others get nothing
2220    if (team_id < trip_count) {
2221      *pupper = *plower = *plower + team_id * incr;
2222    } else {
2223      *plower = *pupper + incr; // zero-trip loop
2224    }
2225    if (plastiter != NULL)
2226      *plastiter = (team_id == trip_count - 1);
2227  } else {
2228    if (__kmp_static == kmp_sch_static_balanced) {
2229      UT chunk = trip_count / nteams;
2230      UT extras = trip_count % nteams;
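      // The first 'extras' teams get chunk + 1 iterations, the rest get chunk.
      // Illustrative example: trip_count=10, nteams=4 gives chunk=2, extras=2,
      // so the teams receive 3, 3, 2 and 2 iterations respectively.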
2231      *plower +=
2232          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2233      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2234      if (plastiter != NULL)
2235        *plastiter = (team_id == nteams - 1);
2236    } else {
2237      T chunk_inc_count =
2238          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2239      T upper = *pupper;
2240      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2241      // Unknown static scheduling type.
2242      *plower += team_id * chunk_inc_count;
2243      *pupper = *plower + chunk_inc_count - incr;
2244      // Check/correct bounds if needed
2245      if (incr > 0) {
2246        if (*pupper < *plower)
2247          *pupper = traits_t<T>::max_value;
2248        if (plastiter != NULL)
2249          *plastiter = *plower <= upper && *pupper > upper - incr;
2250        if (*pupper > upper)
2251          *pupper = upper; // tracker C73258
2252      } else {
2253        if (*pupper > *plower)
2254          *pupper = traits_t<T>::min_value;
2255        if (plastiter != NULL)
2256          *plastiter = *plower >= upper && *pupper < upper - incr;
2257        if (*pupper < upper)
2258          *pupper = upper; // tracker C73258
2259      }
2260    }
2261  }
2262}
2263
2264//-----------------------------------------------------------------------------
2265// Dispatch routines
2266//    Transfer call to template< type T >
2267//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2268//                         T lb, T ub, ST st, ST chunk )
2269extern "C" {
2270
2271/*!
2272@ingroup WORK_SHARING
2273@{
2274@param loc Source location
2275@param gtid Global thread id
2276@param schedule Schedule type
2277@param lb  Lower bound
2278@param ub  Upper bound
2279@param st  Step (or increment if you prefer)
2280@param chunk The chunk size to block with
2281
2282This function prepares the runtime to start a dynamically scheduled for loop,
2283saving the loop arguments.
2284These functions are all identical apart from the types of the arguments.
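
A rough usage sketch (illustrative only, not part of the runtime) of how
compiler-generated code is expected to drive these entry points for a
dynamically scheduled loop with a positive stride; loc, gtid, N and body()
stand in for values the compiler supplies:

@code
    __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked,
                           0, N - 1, 1, 4); // lb, ub, st, chunk
    kmp_int32 last, lb, ub, st;
    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
      for (kmp_int32 i = lb; i <= ub; i += st)
        body(i);
    }
@endcode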
2285*/
2286
2287void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2288                            enum sched_type schedule, kmp_int32 lb,
2289                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2290  KMP_DEBUG_ASSERT(__kmp_init_serial);
2291#if OMPT_SUPPORT && OMPT_OPTIONAL
2292  OMPT_STORE_RETURN_ADDRESS(gtid);
2293#endif
2294  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2295}
2296/*!
2297See @ref __kmpc_dispatch_init_4
2298*/
2299void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2300                             enum sched_type schedule, kmp_uint32 lb,
2301                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2302  KMP_DEBUG_ASSERT(__kmp_init_serial);
2303#if OMPT_SUPPORT && OMPT_OPTIONAL
2304  OMPT_STORE_RETURN_ADDRESS(gtid);
2305#endif
2306  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2307}
2308
2309/*!
2310See @ref __kmpc_dispatch_init_4
2311*/
2312void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2313                            enum sched_type schedule, kmp_int64 lb,
2314                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2315  KMP_DEBUG_ASSERT(__kmp_init_serial);
2316#if OMPT_SUPPORT && OMPT_OPTIONAL
2317  OMPT_STORE_RETURN_ADDRESS(gtid);
2318#endif
2319  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2320}
2321
2322/*!
2323See @ref __kmpc_dispatch_init_4
2324*/
2325void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2326                             enum sched_type schedule, kmp_uint64 lb,
2327                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2328  KMP_DEBUG_ASSERT(__kmp_init_serial);
2329#if OMPT_SUPPORT && OMPT_OPTIONAL
2330  OMPT_STORE_RETURN_ADDRESS(gtid);
2331#endif
2332  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2333}
2334
2335/*!
2336See @ref __kmpc_dispatch_init_4
2337
2338These differ from the __kmpc_dispatch_init set of functions in that they are
2339called for the composite distribute parallel for construct. Thus, before
2340dispatching the regular iterations, the per-team iteration space is computed.
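
Concretely, each entry point below first calls __kmp_dist_get_bounds to narrow
[lb, ub] to the calling team's share of the distribute iteration space and then
hands the sub-range to __kmp_dispatch_init.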
2341
2342These functions are all identical apart from the types of the arguments.
2343*/
2344void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2345                                 enum sched_type schedule, kmp_int32 *p_last,
2346                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2347                                 kmp_int32 chunk) {
2348  KMP_DEBUG_ASSERT(__kmp_init_serial);
2349#if OMPT_SUPPORT && OMPT_OPTIONAL
2350  OMPT_STORE_RETURN_ADDRESS(gtid);
2351#endif
2352  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2353  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2354}
2355
2356void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2357                                  enum sched_type schedule, kmp_int32 *p_last,
2358                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2359                                  kmp_int32 chunk) {
2360  KMP_DEBUG_ASSERT(__kmp_init_serial);
2361#if OMPT_SUPPORT && OMPT_OPTIONAL
2362  OMPT_STORE_RETURN_ADDRESS(gtid);
2363#endif
2364  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2365  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2366}
2367
2368void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2369                                 enum sched_type schedule, kmp_int32 *p_last,
2370                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2371                                 kmp_int64 chunk) {
2372  KMP_DEBUG_ASSERT(__kmp_init_serial);
2373#if OMPT_SUPPORT && OMPT_OPTIONAL
2374  OMPT_STORE_RETURN_ADDRESS(gtid);
2375#endif
2376  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2377  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2378}
2379
2380void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2381                                  enum sched_type schedule, kmp_int32 *p_last,
2382                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2383                                  kmp_int64 chunk) {
2384  KMP_DEBUG_ASSERT(__kmp_init_serial);
2385#if OMPT_SUPPORT && OMPT_OPTIONAL
2386  OMPT_STORE_RETURN_ADDRESS(gtid);
2387#endif
2388  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2389  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2390}
2391
2392/*!
2393@param loc Source code location
2394@param gtid Global thread id
2395@param p_last Pointer to a flag set to one if this is the last chunk or zero
2396otherwise
2397@param p_lb   Pointer to the lower bound for the next chunk of work
2398@param p_ub   Pointer to the upper bound for the next chunk of work
2399@param p_st   Pointer to the stride for the next chunk of work
2400@return one if there is work to be done, zero otherwise
2401
2402Get the next dynamically allocated chunk of work for this thread.
2403If there is no more work, then lb, ub and stride need not be modified.
2404*/
2405int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2406                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2407#if OMPT_SUPPORT && OMPT_OPTIONAL
2408  OMPT_STORE_RETURN_ADDRESS(gtid);
2409#endif
2410  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2411#if OMPT_SUPPORT && OMPT_OPTIONAL
2412                                        ,
2413                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
2414#endif
2415                                            );
2416}
2417
2418/*!
2419See @ref __kmpc_dispatch_next_4
2420*/
2421int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2422                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2423                            kmp_int32 *p_st) {
2424#if OMPT_SUPPORT && OMPT_OPTIONAL
2425  OMPT_STORE_RETURN_ADDRESS(gtid);
2426#endif
2427  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2428#if OMPT_SUPPORT && OMPT_OPTIONAL
2429                                         ,
2430                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2431#endif
2432                                             );
2433}
2434
2435/*!
2436See @ref __kmpc_dispatch_next_4
2437*/
2438int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2439                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2440#if OMPT_SUPPORT && OMPT_OPTIONAL
2441  OMPT_STORE_RETURN_ADDRESS(gtid);
2442#endif
2443  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2444#if OMPT_SUPPORT && OMPT_OPTIONAL
2445                                        ,
2446                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
2447#endif
2448                                            );
2449}
2450
2451/*!
2452See @ref __kmpc_dispatch_next_4
2453*/
2454int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2455                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2456                            kmp_int64 *p_st) {
2457#if OMPT_SUPPORT && OMPT_OPTIONAL
2458  OMPT_STORE_RETURN_ADDRESS(gtid);
2459#endif
2460  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2461#if OMPT_SUPPORT && OMPT_OPTIONAL
2462                                         ,
2463                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2464#endif
2465                                             );
2466}
2467
2468/*!
2469@param loc Source code location
2470@param gtid Global thread id
2471
2472Mark the end of a dynamic loop.
2473*/
2474void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2475  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2476}
2477
2478/*!
2479See @ref __kmpc_dispatch_fini_4
2480*/
2481void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2482  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2483}
2484
2485/*!
2486See @ref __kmpc_dispatch_fini_4
2487*/
2488void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2489  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2490}
2491
2492/*!
2493See @ref __kmpc_dispatch_fini_4
2494*/
2495void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2496  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2497}
2498/*! @} */
2499
2500//-----------------------------------------------------------------------------
2501// Non-template routines from kmp_dispatch.cpp used in other sources
2502
2503kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2504  return value == checker;
2505}
2506
2507kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2508  return value != checker;
2509}
2510
2511kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2512  return value < checker;
2513}
2514
2515kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2516  return value >= checker;
2517}
2518
2519kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2520  return value <= checker;
2521}
2522
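// __kmp_wait_4 spins until pred(*spinner, checker) becomes true, yielding when
// the machine is oversubscribed, and returns the last value read from the spin
// location.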
2523kmp_uint32
2524__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2525             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2526             void *obj // Higher-level synchronization object, or NULL.
2527             ) {
2528  // note: we may not belong to a team at this point
2529  volatile kmp_uint32 *spin = spinner;
2530  kmp_uint32 check = checker;
2531  kmp_uint32 spins;
2532  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2533  kmp_uint32 r;
2534
2535  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2536  KMP_INIT_YIELD(spins);
2537  // main wait spin loop
2538  while (!f(r = TCR_4(*spin), check)) {
2539    KMP_FSYNC_SPIN_PREPARE(obj);
2540    /* GEH - remove this since it was accidentally introduced when kmp_wait was
2541       split. It causes problems with infinite recursion because of exit lock */
2542    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543        __kmp_abort_thread(); */
2544    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2545  }
2546  KMP_FSYNC_SPIN_ACQUIRED(obj);
2547  return r;
2548}
2549
2550void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2551                      kmp_uint32 (*pred)(void *, kmp_uint32),
2552                      void *obj // Higher-level synchronization object, or NULL.
2553                      ) {
2554  // note: we may not belong to a team at this point
2555  void *spin = spinner;
2556  kmp_uint32 check = checker;
2557  kmp_uint32 spins;
2558  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2559
2560  KMP_FSYNC_SPIN_INIT(obj, spin);
2561  KMP_INIT_YIELD(spins);
2562  // main wait spin loop
2563  while (!f(spin, check)) {
2564    KMP_FSYNC_SPIN_PREPARE(obj);
2565    /* if we have waited a bit, or are oversubscribed, yield */
2566    /* pause is in the following code */
2567    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2568  }
2569  KMP_FSYNC_SPIN_ACQUIRED(obj);
2570}
2571
2572} // extern "C"
2573
2574#ifdef KMP_GOMP_COMPAT
2575
2576void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2577                               enum sched_type schedule, kmp_int32 lb,
2578                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2579                               int push_ws) {
2580  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2581                                 push_ws);
2582}
2583
2584void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2585                                enum sched_type schedule, kmp_uint32 lb,
2586                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2587                                int push_ws) {
2588  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2589                                  push_ws);
2590}
2591
2592void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2593                               enum sched_type schedule, kmp_int64 lb,
2594                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2595                               int push_ws) {
2596  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2597                                 push_ws);
2598}
2599
2600void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2601                                enum sched_type schedule, kmp_uint64 lb,
2602                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2603                                int push_ws) {
2604  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2605                                  push_ws);
2606}
2607
2608void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2609  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2610}
2611
2612void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2613  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2614}
2615
2616void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2617  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2618}
2619
2620void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2621  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2622}
2623
2624#endif /* KMP_GOMP_COMPAT */
2625
2626/* ------------------------------------------------------------------------ */
2627