kmp_dispatch.cpp revision 360784
1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may
16 *       change between parallel regions.  __kmp_max_nth is the largest
17 *       value __kmp_nth may take, 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44
45  KMP_DEBUG_ASSERT(gtid_ref);
46
47  if (__kmp_env_consistency_check) {
48    th = __kmp_threads[*gtid_ref];
49    if (th->th.th_root->r.r_active &&
50        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56    }
57  }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62
63  if (__kmp_env_consistency_check) {
64    th = __kmp_threads[*gtid_ref];
65    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67    }
68  }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(enum sched_type schedule,
73                                         bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  int monotonicity;
76  // default to monotonic
77  monotonicity = SCHEDULE_MONOTONIC;
78  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79    monotonicity = SCHEDULE_NONMONOTONIC;
80  else if (SCHEDULE_HAS_MONOTONIC(schedule))
81    monotonicity = SCHEDULE_MONOTONIC;
82  return monotonicity;
83}
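// For example, kmp_sch_dynamic_chunked combined with the
// kmp_sch_modifier_nonmonotonic bit maps to SCHEDULE_NONMONOTONIC here, while
// an unmodified kmp_sch_dynamic_chunked keeps the SCHEDULE_MONOTONIC default.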
84
85// Initialize a dispatch_private_info_template<T> buffer for a particular
86// type of schedule,chunk.  The loop description is found in lb (lower bound),
87// ub (upper bound), and st (stride).  nproc is the number of threads relevant
88// to the scheduling (often the number of threads in a team, but not always if
89// hierarchical scheduling is used).  tid is the id of the thread calling
90// the function within the group of nproc threads.  It will have a value
91// between 0 and nproc - 1.  This is often just the thread id within a team, but
92// is not necessarily the case when using hierarchical scheduling.
93// loc is the source file location of the corresponding loop
94// gtid is the global thread id
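// (For instance, __kmp_dispatch_init below passes nproc = th->th.th_team_nproc
// and tid = th->th.th_info.ds.ds_tid.)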
95template <typename T>
96void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97                                   dispatch_private_info_template<T> *pr,
98                                   enum sched_type schedule, T lb, T ub,
99                                   typename traits_t<T>::signed_t st,
100#if USE_ITT_BUILD
101                                   kmp_uint64 *cur_chunk,
102#endif
103                                   typename traits_t<T>::signed_t chunk,
104                                   T nproc, T tid) {
105  typedef typename traits_t<T>::unsigned_t UT;
106  typedef typename traits_t<T>::floating_t DBL;
107
108  int active;
109  T tc;
110  kmp_info_t *th;
111  kmp_team_t *team;
112  int monotonicity;
113  bool use_hier;
114
115#ifdef KMP_DEBUG
116  typedef typename traits_t<T>::signed_t ST;
117  {
118    char *buff;
119    // create format specifiers before the debug output
120    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123                            traits_t<T>::spec, traits_t<T>::spec,
124                            traits_t<ST>::spec, traits_t<ST>::spec,
125                            traits_t<T>::spec, traits_t<T>::spec);
126    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127    __kmp_str_free(&buff);
128  }
129#endif
130  /* setup data */
131  th = __kmp_threads[gtid];
132  team = th->th.th_team;
133  active = !team->t.t_serialized;
134
135#if USE_ITT_BUILD
136  int itt_need_metadata_reporting =
137      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139      team->t.t_active_level == 1;
140#endif
141
142#if KMP_USE_HIER_SCHED
143  use_hier = pr->flags.use_hier;
144#else
145  use_hier = false;
146#endif
147
148  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151
152  /* Pick up the nomerge/ordered bits from the scheduling type */
153  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154    pr->flags.nomerge = TRUE;
155    schedule =
156        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157  } else {
158    pr->flags.nomerge = FALSE;
159  }
160  pr->type_size = traits_t<T>::type_size; // remember the size of variables
161  if (kmp_ord_lower & schedule) {
162    pr->flags.ordered = TRUE;
163    schedule =
164        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165  } else {
166    pr->flags.ordered = FALSE;
167  }
168  // Ordered overrides nonmonotonic
169  if (pr->flags.ordered) {
170    monotonicity = SCHEDULE_MONOTONIC;
171  }
172
173  if (schedule == kmp_sch_static) {
174    schedule = __kmp_static;
175  } else {
176    if (schedule == kmp_sch_runtime) {
177      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178      // not specified)
179      schedule = team->t.t_sched.r_sched_type;
180      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182      // Detail the schedule if needed (global controls are differentiated
183      // appropriately)
184      if (schedule == kmp_sch_guided_chunked) {
185        schedule = __kmp_guided;
186      } else if (schedule == kmp_sch_static) {
187        schedule = __kmp_static;
188      }
189      // Use the chunk size specified by OMP_SCHEDULE (or default if not
190      // specified)
191      chunk = team->t.t_sched.chunk;
192#if USE_ITT_BUILD
193      if (cur_chunk)
194        *cur_chunk = chunk;
195#endif
196#ifdef KMP_DEBUG
197      {
198        char *buff;
199        // create format specifiers before the debug output
200        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201                                "schedule:%%d chunk:%%%s\n",
202                                traits_t<ST>::spec);
203        KD_TRACE(10, (buff, gtid, schedule, chunk));
204        __kmp_str_free(&buff);
205      }
206#endif
207    } else {
208      if (schedule == kmp_sch_guided_chunked) {
209        schedule = __kmp_guided;
210      }
211      if (chunk <= 0) {
212        chunk = KMP_DEFAULT_CHUNK;
213      }
214    }
215
216    if (schedule == kmp_sch_auto) {
217      // mapping and differentiation: in the __kmp_do_serial_initialize()
218      schedule = __kmp_auto;
219#ifdef KMP_DEBUG
220      {
221        char *buff;
222        // create format specifiers before the debug output
223        buff = __kmp_str_format(
224            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225            "schedule:%%d chunk:%%%s\n",
226            traits_t<ST>::spec);
227        KD_TRACE(10, (buff, gtid, schedule, chunk));
228        __kmp_str_free(&buff);
229      }
230#endif
231    }
232#if KMP_STATIC_STEAL_ENABLED
233    // map nonmonotonic:dynamic to static steal
234    if (schedule == kmp_sch_dynamic_chunked) {
235      if (monotonicity == SCHEDULE_NONMONOTONIC)
236        schedule = kmp_sch_static_steal;
237    }
238#endif
239    /* guided analytical not safe for too many threads */
240    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241      schedule = kmp_sch_guided_iterative_chunked;
242      KMP_WARNING(DispatchManyThreads);
243    }
244    if (schedule == kmp_sch_runtime_simd) {
245      // compiler provides simd_width in the chunk parameter
246      schedule = team->t.t_sched.r_sched_type;
247      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249      // Detail the schedule if needed (global controls are differentiated
250      // appropriately)
251      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252          schedule == __kmp_static) {
253        schedule = kmp_sch_static_balanced_chunked;
254      } else {
255        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256          schedule = kmp_sch_guided_simd;
257        }
258        chunk = team->t.t_sched.chunk * chunk;
259      }
260#if USE_ITT_BUILD
261      if (cur_chunk)
262        *cur_chunk = chunk;
263#endif
264#ifdef KMP_DEBUG
265      {
266        char *buff;
267        // create format specifiers before the debug output
268        buff = __kmp_str_format(
269            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270            " chunk:%%%s\n",
271            traits_t<ST>::spec);
272        KD_TRACE(10, (buff, gtid, schedule, chunk));
273        __kmp_str_free(&buff);
274      }
275#endif
276    }
277    pr->u.p.parm1 = chunk;
278  }
279  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280              "unknown scheduling type");
281
282  pr->u.p.count = 0;
283
284  if (__kmp_env_consistency_check) {
285    if (st == 0) {
286      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288    }
289  }
290  // compute trip count
291  if (st == 1) { // most common case
292    if (ub >= lb) {
293      tc = ub - lb + 1;
294    } else { // ub < lb
295      tc = 0; // zero-trip
296    }
297  } else if (st < 0) {
298    if (lb >= ub) {
299      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300      // where the division needs to be unsigned regardless of the result type
301      tc = (UT)(lb - ub) / (-st) + 1;
302    } else { // lb < ub
303      tc = 0; // zero-trip
304    }
305  } else { // st > 0
306    if (ub >= lb) {
307      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308      // where the division needs to be unsigned regardless of the result type
309      tc = (UT)(ub - lb) / st + 1;
310    } else { // ub < lb
311      tc = 0; // zero-trip
312    }
313  }
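  // Illustrative example: for a loop like (i = 10; i > 0; i -= 3) the compiler
  // passes lb = 10, ub = 1, st = -3, so tc = (UT)(10 - 1) / 3 + 1 = 4,
  // matching the iterations i = 10, 7, 4, 1.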
314
315#if KMP_STATS_ENABLED
316  if (KMP_MASTER_GTID(gtid)) {
317    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318  }
319#endif
320
321  pr->u.p.lb = lb;
322  pr->u.p.ub = ub;
323  pr->u.p.st = st;
324  pr->u.p.tc = tc;
325
326#if KMP_OS_WINDOWS
327  pr->u.p.last_upper = ub + st;
328#endif /* KMP_OS_WINDOWS */
329
330  /* NOTE: only the active parallel region(s) have active ordered sections */
331
332  if (active) {
333    if (pr->flags.ordered) {
334      pr->ordered_bumped = 0;
335      pr->u.p.ordered_lower = 1;
336      pr->u.p.ordered_upper = 0;
337    }
338  }
339
340  switch (schedule) {
341#if (KMP_STATIC_STEAL_ENABLED)
342  case kmp_sch_static_steal: {
343    T ntc, init;
344
345    KD_TRACE(100,
346             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347              gtid));
348
349    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350    if (nproc > 1 && ntc >= nproc) {
351      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352      T id = tid;
353      T small_chunk, extras;
354
355      small_chunk = ntc / nproc;
356      extras = ntc % nproc;
357
358      init = id * small_chunk + (id < extras ? id : extras);
359      pr->u.p.count = init;
360      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
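      // Illustrative example: ntc = 10 chunks and nproc = 4 threads give
      // small_chunk = 2 and extras = 2, so the initial chunk ranges are
      // T0:[0,3), T1:[3,6), T2:[6,8), T3:[8,10); the first 'extras' threads
      // own one extra chunk.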
361
362      pr->u.p.parm2 = lb;
363      // parm3 is the number of times to attempt stealing which is
364      // proportional to the number of chunks per thread up until
365      // the maximum value of nproc.
366      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368      pr->u.p.st = st;
369      if (traits_t<T>::type_size > 4) {
370        // AC: TODO: check if 16-byte CAS available and use it to
371        // improve performance (probably wait for explicit request
372        // before spending time on this).
373        // For now use dynamically allocated per-thread lock,
374        // free memory in __kmp_dispatch_next when status==0.
375        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
376        th->th.th_dispatch->th_steal_lock =
377            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
379      }
380      break;
381    } else {
382      /* too few chunks: switching to kmp_sch_dynamic_chunked */
383      schedule = kmp_sch_dynamic_chunked;
384      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385                     "kmp_sch_dynamic_chunked\n",
386                      gtid));
387      if (pr->u.p.parm1 <= 0)
388        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389      break;
390    } // if
391  } // case
392#endif
393  case kmp_sch_static_balanced: {
394    T init, limit;
395
396    KD_TRACE(
397        100,
398        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399         gtid));
400
401    if (nproc > 1) {
402      T id = tid;
403
404      if (tc < nproc) {
405        if (id < tc) {
406          init = id;
407          limit = id;
408          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409        } else {
410          pr->u.p.count = 1; /* means no more chunks to execute */
411          pr->u.p.parm1 = FALSE;
412          break;
413        }
414      } else {
415        T small_chunk = tc / nproc;
416        T extras = tc % nproc;
417        init = id * small_chunk + (id < extras ? id : extras);
418        limit = init + small_chunk - (id < extras ? 0 : 1);
419        pr->u.p.parm1 = (id == nproc - 1);
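        // Illustrative example: tc = 10 iterations and nproc = 4 give
        // small_chunk = 2 and extras = 2, so T0 runs [0,2], T1 [3,5],
        // T2 [6,7], T3 [8,9]; only T3 (id == nproc - 1) sets parm1 for
        // lastprivate.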
420      }
421    } else {
422      if (tc > 0) {
423        init = 0;
424        limit = tc - 1;
425        pr->u.p.parm1 = TRUE;
426      } else {
427        // zero trip count
428        pr->u.p.count = 1; /* means no more chunks to execute */
429        pr->u.p.parm1 = FALSE;
430        break;
431      }
432    }
433#if USE_ITT_BUILD
434    // Calculate chunk for metadata report
435    if (itt_need_metadata_reporting)
436      if (cur_chunk)
437        *cur_chunk = limit - init + 1;
438#endif
439    if (st == 1) {
440      pr->u.p.lb = lb + init;
441      pr->u.p.ub = lb + limit;
442    } else {
443      // ub_tmp is the calculated upper bound, "ub" is the user-defined one
444      T ub_tmp = lb + limit * st;
445      pr->u.p.lb = lb + init * st;
446      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447      // it exactly
448      if (st > 0) {
449        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450      } else {
451        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452      }
453    }
454    if (pr->flags.ordered) {
455      pr->u.p.ordered_lower = init;
456      pr->u.p.ordered_upper = limit;
457    }
458    break;
459  } // case
460  case kmp_sch_static_balanced_chunked: {
461    // similar to balanced, but chunk adjusted to multiple of simd width
462    T nth = nproc;
463    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464                   " -> falling-through to static_greedy\n",
465                   gtid));
466    schedule = kmp_sch_static_greedy;
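    // Illustrative example: tc = 100, nth = 8 and a simd width (chunk) of 8
    // give ceil(100 / 8) = 13 iterations per thread, rounded up by the mask
    // below to 16, the next multiple of the simd width (the & ~(chunk - 1)
    // trick assumes chunk is a power of two).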
467    if (nth > 1)
468      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469    else
470      pr->u.p.parm1 = tc;
471    break;
472  } // case
473  case kmp_sch_guided_simd:
474  case kmp_sch_guided_iterative_chunked: {
475    KD_TRACE(
476        100,
477        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478         " case\n",
479         gtid));
480
481    if (nproc > 1) {
482      if ((2L * chunk + 1) * nproc >= tc) {
483        /* chunk size too large, switch to dynamic */
484        schedule = kmp_sch_dynamic_chunked;
485      } else {
486        // when remaining iters become less than parm2 - switch to dynamic
487        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488        *(double *)&pr->u.p.parm3 =
489            guided_flt_param / nproc; // may occupy parm3 and parm4
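        // Illustrative example (assuming the default guided_int_param = 2 and
        // guided_flt_param = 0.5): with nproc = 4 and chunk = 7 this gives
        // parm2 = 2 * 4 * 8 = 64 and parm3 = 0.5 / 4 = 0.125, so while more
        // than 64 iterations remain each grab takes remaining / 8 iterations;
        // below that threshold the schedule degrades to dynamic with chunk 7.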
490      }
491    } else {
492      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493                     "kmp_sch_static_greedy\n",
494                     gtid));
495      schedule = kmp_sch_static_greedy;
496      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497      KD_TRACE(
498          100,
499          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500           gtid));
501      pr->u.p.parm1 = tc;
502    } // if
503  } // case
504  break;
505  case kmp_sch_guided_analytical_chunked: {
506    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507                   "kmp_sch_guided_analytical_chunked case\n",
508                   gtid));
509
510    if (nproc > 1) {
511      if ((2L * chunk + 1) * nproc >= tc) {
512        /* chunk size too large, switch to dynamic */
513        schedule = kmp_sch_dynamic_chunked;
514      } else {
515        /* commonly used term: (2 nproc - 1)/(2 nproc) */
516        DBL x;
517
518#if KMP_USE_X87CONTROL
519        /* Linux* OS already has 64-bit computation by default for long double,
520           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522           instead of the default 53-bit. Even though long double doesn't work
523           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524           expected to impact the correctness of the algorithm, but this has not
525           been mathematically proven. */
526        // save original FPCW and set precision to 64-bit, as
527        // Windows* OS on IA-32 architecture defaults to 53-bit
528        unsigned int oldFpcw = _control87(0, 0);
529        _control87(_PC_64, _MCW_PC); // 0,0x30000
530#endif
531        /* value used for comparison in solver for cross-over point */
532        long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533
534        /* crossover point--chunk indexes equal to or greater than
535           this point switch to dynamic-style scheduling */
536        UT cross;
537
538        /* commonly used term: (2 nproc - 1)/(2 nproc) */
539        x = (long double)1.0 - (long double)0.5 / nproc;
540
541#ifdef KMP_DEBUG
542        { // test natural alignment
543          struct _test_a {
544            char a;
545            union {
546              char b;
547              DBL d;
548            };
549          } t;
550          ptrdiff_t natural_alignment =
551              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553          // long)natural_alignment );
554          KMP_DEBUG_ASSERT(
555              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556        }
557#endif // KMP_DEBUG
558
559        /* save the term in thread private dispatch structure */
560        *(DBL *)&pr->u.p.parm3 = x;
561
562        /* solve for the crossover point to the nearest integer i for which C_i
563           <= chunk */
564        {
565          UT left, right, mid;
566          long double p;
567
568          /* estimate initial upper and lower bound */
569
570          /* doesn't matter what value right is as long as it is positive, but
571             it affects performance of the solver */
572          right = 229;
573          p = __kmp_pow<UT>(x, right);
574          if (p > target) {
575            do {
576              p *= p;
577              right <<= 1;
578            } while (p > target && right < (1 << 27));
579            /* lower bound is previous (failed) estimate of upper bound */
580            left = right >> 1;
581          } else {
582            left = 0;
583          }
584
585          /* bisection root-finding method */
586          while (left + 1 < right) {
587            mid = (left + right) / 2;
588            if (__kmp_pow<UT>(x, mid) > target) {
589              left = mid;
590            } else {
591              right = mid;
592            }
593          } // while
594          cross = right;
595        }
596        /* assert sanity of computed crossover point */
597        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598                   __kmp_pow<UT>(x, cross) <= target);
599
600        /* save the crossover point in thread private dispatch structure */
601        pr->u.p.parm2 = cross;
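        // Illustrative example: nproc = 2, chunk = 1, tc = 100 give
        // x = 1 - 0.5 / 2 = 0.75 and target = (2 * 1 + 1) * 2 / 100 = 0.06;
        // since 0.75^9 ~ 0.075 > 0.06 >= 0.75^10 ~ 0.056, the solver yields
        // cross = 10, i.e. chunk index 10 onward uses dynamic-style scheduling.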
602
603// C75803
604#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606#else
607#define GUIDED_ANALYTICAL_WORKAROUND (x)
608#endif
609        /* dynamic-style scheduling offset */
610        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612                        cross * chunk;
613#if KMP_USE_X87CONTROL
614        // restore FPCW
615        _control87(oldFpcw, _MCW_PC);
616#endif
617      } // if
618    } else {
619      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620                     "kmp_sch_static_greedy\n",
621                     gtid));
622      schedule = kmp_sch_static_greedy;
623      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624      pr->u.p.parm1 = tc;
625    } // if
626  } // case
627  break;
628  case kmp_sch_static_greedy:
629    KD_TRACE(
630        100,
631        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632         gtid));
633    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634    break;
635  case kmp_sch_static_chunked:
636  case kmp_sch_dynamic_chunked:
637    if (pr->u.p.parm1 <= 0) {
638      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639    }
640    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642                   gtid));
643    break;
644  case kmp_sch_trapezoidal: {
645    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646
647    T parm1, parm2, parm3, parm4;
648    KD_TRACE(100,
649             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650              gtid));
651
652    parm1 = chunk;
653
654    /* F : size of the first cycle */
655    parm2 = (tc / (2 * nproc));
656
657    if (parm2 < 1) {
658      parm2 = 1;
659    }
660
661    /* L : size of the last cycle.  Make sure the last cycle is not larger
662       than the first cycle. */
663    if (parm1 < 1) {
664      parm1 = 1;
665    } else if (parm1 > parm2) {
666      parm1 = parm2;
667    }
668
669    /* N : number of cycles */
670    parm3 = (parm2 + parm1);
671    parm3 = (2 * tc + parm3 - 1) / parm3;
672
673    if (parm3 < 2) {
674      parm3 = 2;
675    }
676
677    /* sigma : decreasing incr of the trapezoid */
678    parm4 = (parm3 - 1);
679    parm4 = (parm2 - parm1) / parm4;
680
681    // pointless check, because parm4 >= 0 always
682    // if ( parm4 < 0 ) {
683    //    parm4 = 0;
684    //}
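    // Illustrative example: tc = 1000, nproc = 4, chunk = 1 give
    // parm2 = 1000 / 8 = 125 (first chunk), parm1 = 1 (minimum chunk),
    // parm3 = ceil(2000 / 126) = 16 cycles and parm4 = (125 - 1) / 15 = 8,
    // so chunk sizes decrease 125, 117, 109, ... toward the minimum.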
685
686    pr->u.p.parm1 = parm1;
687    pr->u.p.parm2 = parm2;
688    pr->u.p.parm3 = parm3;
689    pr->u.p.parm4 = parm4;
690  } // case
691  break;
692
693  default: {
694    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695                KMP_HNT(GetNewerLibrary), // Hint
696                __kmp_msg_null // Variadic argument list terminator
697                );
698  } break;
699  } // switch
700  pr->schedule = schedule;
701}
702
703#if KMP_USE_HIER_SCHED
704template <typename T>
705inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706                                             typename traits_t<T>::signed_t st);
707template <>
708inline void
709__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710                                            kmp_int32 ub, kmp_int32 st) {
711  __kmp_dispatch_init_hierarchy<kmp_int32>(
712      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714}
715template <>
716inline void
717__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718                                             kmp_uint32 ub, kmp_int32 st) {
719  __kmp_dispatch_init_hierarchy<kmp_uint32>(
720      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722}
723template <>
724inline void
725__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726                                            kmp_int64 ub, kmp_int64 st) {
727  __kmp_dispatch_init_hierarchy<kmp_int64>(
728      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730}
731template <>
732inline void
733__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734                                             kmp_uint64 ub, kmp_int64 st) {
735  __kmp_dispatch_init_hierarchy<kmp_uint64>(
736      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738}
739
740// free all the hierarchy scheduling memory associated with the team
741void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743  for (int i = 0; i < num_disp_buff; ++i) {
744    // type does not matter here so use kmp_int32
745    auto sh =
746        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747            &team->t.t_disp_buffer[i]);
748    if (sh->hier) {
749      sh->hier->deallocate();
750      __kmp_free(sh->hier);
751    }
752  }
753}
754#endif
755
756// UT - unsigned flavor of T, ST - signed flavor of T,
757// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758template <typename T>
759static void
760__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761                    T ub, typename traits_t<T>::signed_t st,
762                    typename traits_t<T>::signed_t chunk, int push_ws) {
763  typedef typename traits_t<T>::unsigned_t UT;
764
765  int active;
766  kmp_info_t *th;
767  kmp_team_t *team;
768  kmp_uint32 my_buffer_index;
769  dispatch_private_info_template<T> *pr;
770  dispatch_shared_info_template<T> volatile *sh;
771
772  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773                   sizeof(dispatch_private_info));
774  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775                   sizeof(dispatch_shared_info));
776
777  if (!TCR_4(__kmp_init_parallel))
778    __kmp_parallel_initialize();
779
780  __kmp_resume_if_soft_paused();
781
782#if INCLUDE_SSC_MARKS
783  SSC_MARK_DISPATCH_INIT();
784#endif
785#ifdef KMP_DEBUG
786  typedef typename traits_t<T>::signed_t ST;
787  {
788    char *buff;
789    // create format specifiers before the debug output
790    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792                            traits_t<ST>::spec, traits_t<T>::spec,
793                            traits_t<T>::spec, traits_t<ST>::spec);
794    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795    __kmp_str_free(&buff);
796  }
797#endif
798  /* setup data */
799  th = __kmp_threads[gtid];
800  team = th->th.th_team;
801  active = !team->t.t_serialized;
802  th->th.th_ident = loc;
803
804  // Any half-decent optimizer will remove this test when the blocks are empty
805  // since the macros expand to nothing
806  // when statistics are disabled.
807  if (schedule == __kmp_static) {
808    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
809  } else {
810    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
811  }
812
813#if KMP_USE_HIER_SCHED
814  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
815  // Hierarchical scheduling does not work with ordered, so if ordered is
816  // detected, then revert to threaded scheduling.
817  bool ordered;
818  enum sched_type my_sched = schedule;
819  my_buffer_index = th->th.th_dispatch->th_disp_index;
820  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821      &th->th.th_dispatch
822           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
825    my_sched =
826        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827  ordered = (kmp_ord_lower & my_sched);
828  if (pr->flags.use_hier) {
829    if (ordered) {
830      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
831                     "Disabling hierarchical scheduling.\n",
832                     gtid));
833      pr->flags.use_hier = FALSE;
834    }
835  }
836  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837    // Don't use hierarchical for ordered parallel loops and don't
838    // use the runtime hierarchy if one was specified in the program
839    if (!ordered && !pr->flags.use_hier)
840      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
841  }
842#endif // KMP_USE_HIER_SCHED
843
844#if USE_ITT_BUILD
845  kmp_uint64 cur_chunk = chunk;
846  int itt_need_metadata_reporting =
847      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849      team->t.t_active_level == 1;
850#endif
851  if (!active) {
852    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
854  } else {
855    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
857
858    my_buffer_index = th->th.th_dispatch->th_disp_index++;
859
860    /* What happens when the number of threads changes? Need to resize buffer? */
861    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862        &th->th.th_dispatch
863             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
867                  my_buffer_index));
868  }
869
870  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
871#if USE_ITT_BUILD
872                                &cur_chunk,
873#endif
874                                chunk, (T)th->th.th_team_nproc,
875                                (T)th->th.th_info.ds.ds_tid);
876  if (active) {
877    if (pr->flags.ordered == 0) {
878      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
880    } else {
881      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
883    }
884  }
885
886  if (active) {
887    /* This buffer is free to use once sh->buffer_index reaches
888     * my_buffer_index (see the wait below). */
889
890    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891                   "sh->buffer_index:%d\n",
892                   gtid, my_buffer_index, sh->buffer_index));
893    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
895    // Note: KMP_WAIT() cannot be used here: buffer_index and
896    // my_buffer_index are *always* 32-bit integers.
897    KMP_MB(); /* is this necessary? */
898    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899                   "sh->buffer_index:%d\n",
900                   gtid, my_buffer_index, sh->buffer_index));
901
902    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903    th->th.th_dispatch->th_dispatch_sh_current =
904        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
905#if USE_ITT_BUILD
906    if (pr->flags.ordered) {
907      __kmp_itt_ordered_init(gtid);
908    }
909    // Report loop metadata
910    if (itt_need_metadata_reporting) {
911      // Only report metadata by master of active team at level 1
912      kmp_uint64 schedtype = 0;
913      switch (schedule) {
914      case kmp_sch_static_chunked:
915      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
916        break;
917      case kmp_sch_static_greedy:
918        cur_chunk = pr->u.p.parm1;
919        break;
920      case kmp_sch_dynamic_chunked:
921        schedtype = 1;
922        break;
923      case kmp_sch_guided_iterative_chunked:
924      case kmp_sch_guided_analytical_chunked:
925      case kmp_sch_guided_simd:
926        schedtype = 2;
927        break;
928      default:
929        // Should we put this case under "static"?
930        // case kmp_sch_static_steal:
931        schedtype = 3;
932        break;
933      }
934      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
935    }
936#if KMP_USE_HIER_SCHED
937    if (pr->flags.use_hier) {
938      pr->u.p.count = 0;
939      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
940    }
941#endif // KMP_USE_HIER_SCHED
942#endif /* USE_ITT_BUILD */
943  }
944
945#ifdef KMP_DEBUG
946  {
947    char *buff;
948    // create format specifiers before the debug output
949    buff = __kmp_str_format(
950        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
951        "lb:%%%s ub:%%%s"
952        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962    __kmp_str_free(&buff);
963  }
964#endif
965#if (KMP_STATIC_STEAL_ENABLED)
966  // It cannot be guaranteed that after execution of a loop with some other
967  // schedule kind all the parm3 variables will contain the same value. Even if
968  // they did, a bad case could still arise, e.g. toggling between 0 and 1
969  // rather than incrementing over the program's lifetime, so a dedicated
970  // variable is required: the 'static_steal_counter' is used.
971  if (schedule == kmp_sch_static_steal) {
972    // Other threads will inspect this variable when searching for a victim.
973    // This is a flag showing that other threads may steal from this thread
974    // from now on.
975    volatile T *p = &pr->u.p.static_steal_counter;
976    *p = *p + 1;
977  }
978#endif // ( KMP_STATIC_STEAL_ENABLED )
979
980#if OMPT_SUPPORT && OMPT_OPTIONAL
981  if (ompt_enabled.ompt_callback_work) {
982    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984    ompt_callbacks.ompt_callback(ompt_callback_work)(
985        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
987  }
988#endif
989  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
990}
991
992/* For ordered loops, either __kmp_dispatch_finish() should be called after
993 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994 * every chunk of iterations.  If the ordered section(s) were not executed
995 * for this iteration (or every iteration in this chunk), we need to set the
996 * ordered iteration counters so that the next thread can proceed. */
997template <typename UT>
998static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999  typedef typename traits_t<UT>::signed_t ST;
1000  kmp_info_t *th = __kmp_threads[gtid];
1001
1002  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003  if (!th->th.th_team->t.t_serialized) {
1004
1005    dispatch_private_info_template<UT> *pr =
1006        reinterpret_cast<dispatch_private_info_template<UT> *>(
1007            th->th.th_dispatch->th_dispatch_pr_current);
1008    dispatch_shared_info_template<UT> volatile *sh =
1009        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010            th->th.th_dispatch->th_dispatch_sh_current);
1011    KMP_DEBUG_ASSERT(pr);
1012    KMP_DEBUG_ASSERT(sh);
1013    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1015
1016    if (pr->ordered_bumped) {
1017      KD_TRACE(
1018          1000,
1019          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1020           gtid));
1021      pr->ordered_bumped = 0;
1022    } else {
1023      UT lower = pr->u.p.ordered_lower;
1024
1025#ifdef KMP_DEBUG
1026      {
1027        char *buff;
1028        // create format specifiers before the debug output
1029        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030                                "ordered_iteration:%%%s lower:%%%s\n",
1031                                traits_t<UT>::spec, traits_t<UT>::spec);
1032        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033        __kmp_str_free(&buff);
1034      }
1035#endif
1036
1037      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1039      KMP_MB(); /* is this necessary? */
1040#ifdef KMP_DEBUG
1041      {
1042        char *buff;
1043        // create format specifiers before the debug output
1044        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045                                "ordered_iteration:%%%s lower:%%%s\n",
1046                                traits_t<UT>::spec, traits_t<UT>::spec);
1047        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048        __kmp_str_free(&buff);
1049      }
1050#endif
1051
1052      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1053    } // if
1054  } // if
1055  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1056}
1057
1058#ifdef KMP_GOMP_COMPAT
1059
1060template <typename UT>
1061static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062  typedef typename traits_t<UT>::signed_t ST;
1063  kmp_info_t *th = __kmp_threads[gtid];
1064
1065  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066  if (!th->th.th_team->t.t_serialized) {
1067    //        int cid;
1068    dispatch_private_info_template<UT> *pr =
1069        reinterpret_cast<dispatch_private_info_template<UT> *>(
1070            th->th.th_dispatch->th_dispatch_pr_current);
1071    dispatch_shared_info_template<UT> volatile *sh =
1072        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073            th->th.th_dispatch->th_dispatch_sh_current);
1074    KMP_DEBUG_ASSERT(pr);
1075    KMP_DEBUG_ASSERT(sh);
1076    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1078
1079    //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080    UT lower = pr->u.p.ordered_lower;
1081    UT upper = pr->u.p.ordered_upper;
1082    UT inc = upper - lower + 1;
1083
1084    if (pr->ordered_bumped == inc) {
1085      KD_TRACE(
1086          1000,
1087          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1088           gtid));
1089      pr->ordered_bumped = 0;
1090    } else {
1091      inc -= pr->ordered_bumped;
1092
1093#ifdef KMP_DEBUG
1094      {
1095        char *buff;
1096        // create format specifiers before the debug output
1097        buff = __kmp_str_format(
1098            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102        __kmp_str_free(&buff);
1103      }
1104#endif
1105
1106      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1108
1109      KMP_MB(); /* is this necessary? */
1110      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111                      "ordered_bumped to zero\n",
1112                      gtid));
1113      pr->ordered_bumped = 0;
1114//!!!!! TODO check if the inc should be unsigned, or signed???
1115#ifdef KMP_DEBUG
1116      {
1117        char *buff;
1118        // create format specifiers before the debug output
1119        buff = __kmp_str_format(
1120            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123            traits_t<UT>::spec);
1124        KD_TRACE(1000,
1125                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126        __kmp_str_free(&buff);
1127      }
1128#endif
1129
1130      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1131    }
1132    //        }
1133  }
1134  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1135}
1136
1137#endif /* KMP_GOMP_COMPAT */
1138
1139template <typename T>
1140int __kmp_dispatch_next_algorithm(int gtid,
1141                                  dispatch_private_info_template<T> *pr,
1142                                  dispatch_shared_info_template<T> volatile *sh,
1143                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
1144                                  typename traits_t<T>::signed_t *p_st, T nproc,
1145                                  T tid) {
1146  typedef typename traits_t<T>::unsigned_t UT;
1147  typedef typename traits_t<T>::signed_t ST;
1148  typedef typename traits_t<T>::floating_t DBL;
1149  int status = 0;
1150  kmp_int32 last = 0;
1151  T start;
1152  ST incr;
1153  UT limit, trip, init;
1154  kmp_info_t *th = __kmp_threads[gtid];
1155  kmp_team_t *team = th->th.th_team;
1156
1157  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159  KMP_DEBUG_ASSERT(pr);
1160  KMP_DEBUG_ASSERT(sh);
1161  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1162#ifdef KMP_DEBUG
1163  {
1164    char *buff;
1165    // create format specifiers before the debug output
1166    buff =
1167        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168                         "sh:%%p nproc:%%%s tid:%%%s\n",
1169                         traits_t<T>::spec, traits_t<T>::spec);
1170    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171    __kmp_str_free(&buff);
1172  }
1173#endif
1174
1175  // zero trip count
1176  if (pr->u.p.tc == 0) {
1177    KD_TRACE(10,
1178             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1179              "zero status:%d\n",
1180              gtid, status));
1181    return 0;
1182  }
1183
1184  switch (pr->schedule) {
1185#if (KMP_STATIC_STEAL_ENABLED)
1186  case kmp_sch_static_steal: {
1187    T chunk = pr->u.p.parm1;
1188
1189    KD_TRACE(100,
1190             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1191              gtid));
1192
1193    trip = pr->u.p.tc - 1;
1194
1195    if (traits_t<T>::type_size > 4) {
1196      // use lock for 8-byte and CAS for 4-byte induction
1197      // variable. TODO (optional): check and use 16-byte CAS
1198      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1199      KMP_DEBUG_ASSERT(lck != NULL);
1200      if (pr->u.p.count < (UT)pr->u.p.ub) {
1201        __kmp_acquire_lock(lck, gtid);
1202        // try to get own chunk of iterations
1203        init = (pr->u.p.count)++;
1204        status = (init < (UT)pr->u.p.ub);
1205        __kmp_release_lock(lck, gtid);
1206      } else {
1207        status = 0; // no own chunks
1208      }
1209      if (!status) { // try to steal
1210        kmp_info_t **other_threads = team->t.t_threads;
1211        int while_limit = pr->u.p.parm3;
1212        int while_index = 0;
1213        // TODO: the algorithm for searching for a victim
1214        // should be cleaned up and measured
1215        while ((!status) && (while_limit != ++while_index)) {
1216          T remaining;
1217          T victimIdx = pr->u.p.parm4;
1218          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1219          dispatch_private_info_template<T> *victim =
1220              reinterpret_cast<dispatch_private_info_template<T> *>(
1221                  other_threads[victimIdx]
1222                      ->th.th_dispatch->th_dispatch_pr_current);
1223          while ((victim == NULL || victim == pr ||
1224                  (*(volatile T *)&victim->u.p.static_steal_counter !=
1225                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
1226                 oldVictimIdx != victimIdx) {
1227            victimIdx = (victimIdx + 1) % nproc;
1228            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1229                other_threads[victimIdx]
1230                    ->th.th_dispatch->th_dispatch_pr_current);
1231          }
1232          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1233                          *(volatile T *)&pr->u.p.static_steal_counter)) {
1234            continue; // try once more (nproc attempts in total)
1235            // no victim is ready yet to participate in stealing
1236            // because all victims are still in __kmp_dispatch_init
1237          }
1238          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1239            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1240            continue; // not enough chunks to steal, goto next victim
1241          }
1242
1243          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1244          KMP_ASSERT(lck != NULL);
1245          __kmp_acquire_lock(lck, gtid);
1246          limit = victim->u.p.ub; // keep initial ub
1247          if (victim->u.p.count >= limit ||
1248              (remaining = limit - victim->u.p.count) < 2) {
1249            __kmp_release_lock(lck, gtid);
1250            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1251            continue; // not enough chunks to steal
1252          }
1253          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1254          // by 1
1255          if (remaining > 3) {
1256            // steal 1/4 of remaining
1257            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1258            init = (victim->u.p.ub -= (remaining >> 2));
1259          } else {
1260            // steal 1 chunk of 2 or 3 remaining
1261            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1262            init = (victim->u.p.ub -= 1);
1263          }
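          // Illustrative example: if the victim's ub (initial chunk limit) is
          // 20 and its count is 8, then remaining = 12 > 3, so the thief
          // lowers victim->u.p.ub to 17 and claims chunks [17, 20) for itself
          // (so init = 17).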
1264          __kmp_release_lock(lck, gtid);
1265
1266          KMP_DEBUG_ASSERT(init + 1 <= limit);
1267          pr->u.p.parm4 = victimIdx; // remember victim to steal from
1268          status = 1;
1269          while_index = 0;
1270          // update own count and ub with the stolen range, minus the init chunk
1271          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1272          pr->u.p.count = init + 1;
1273          pr->u.p.ub = limit;
1274          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1275        } // while (search for victim)
1276      } // if (try to find victim and steal)
1277    } else {
1278      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1279      typedef union {
1280        struct {
1281          UT count;
1282          T ub;
1283        } p;
1284        kmp_int64 b;
1285      } union_i4;
1286      // All operations on 'count' or 'ub' must be combined atomically
1287      // together.
1288      {
1289        union_i4 vold, vnew;
1290        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1291        vnew = vold;
1292        vnew.p.count++;
1293        while (!KMP_COMPARE_AND_STORE_ACQ64(
1294            (volatile kmp_int64 *)&pr->u.p.count,
1295            *VOLATILE_CAST(kmp_int64 *) & vold.b,
1296            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1297          KMP_CPU_PAUSE();
1298          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1299          vnew = vold;
1300          vnew.p.count++;
1301        }
1302        vnew = vold;
1303        init = vnew.p.count;
1304        status = (init < (UT)vnew.p.ub);
1305      }
1306
1307      if (!status) {
1308        kmp_info_t **other_threads = team->t.t_threads;
1309        int while_limit = pr->u.p.parm3;
1310        int while_index = 0;
1311
1312        // TODO: the algorithm for searching for a victim
1313        // should be cleaned up and measured
1314        while ((!status) && (while_limit != ++while_index)) {
1315          union_i4 vold, vnew;
1316          kmp_int32 remaining;
1317          T victimIdx = pr->u.p.parm4;
1318          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1319          dispatch_private_info_template<T> *victim =
1320              reinterpret_cast<dispatch_private_info_template<T> *>(
1321                  other_threads[victimIdx]
1322                      ->th.th_dispatch->th_dispatch_pr_current);
1323          while ((victim == NULL || victim == pr ||
1324                  (*(volatile T *)&victim->u.p.static_steal_counter !=
1325                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
1326                 oldVictimIdx != victimIdx) {
1327            victimIdx = (victimIdx + 1) % nproc;
1328            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1329                other_threads[victimIdx]
1330                    ->th.th_dispatch->th_dispatch_pr_current);
1331          }
1332          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1333                          *(volatile T *)&pr->u.p.static_steal_counter)) {
1334            continue; // try once more (nproc attempts in total)
1335            // no victim is ready yet to participate in stealing
1336            // because all victims are still in __kmp_dispatch_init
1337          }
1338          pr->u.p.parm4 = victimIdx; // new victim found
1339          while (1) { // CAS loop if victim has enough chunks to steal
1340            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1341            vnew = vold;
1342
1343            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1344            if (vnew.p.count >= (UT)vnew.p.ub ||
1345                (remaining = vnew.p.ub - vnew.p.count) < 2) {
1346              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1347              break; // not enough chunks to steal, goto next victim
1348            }
1349            if (remaining > 3) {
1350              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1351            } else {
1352              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1353            }
1354            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1355            // TODO: Should this be acquire or release?
1356            if (KMP_COMPARE_AND_STORE_ACQ64(
1357                    (volatile kmp_int64 *)&victim->u.p.count,
1358                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
1359                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1360              // stealing succeeded
1361              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1362                                        vold.p.ub - vnew.p.ub);
1363              status = 1;
1364              while_index = 0;
1365              // now update own count and ub
1366              init = vnew.p.ub;
1367              vold.p.count = init + 1;
1368#if KMP_ARCH_X86
1369              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1370#else
1371              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1372#endif
1373              break;
1374            } // if (check CAS result)
1375            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1376          } // while (try to steal from particular victim)
1377        } // while (search for victim)
1378      } // if (try to find victim and steal)
1379    } // if (4-byte induction variable)
1380    if (!status) {
1381      *p_lb = 0;
1382      *p_ub = 0;
1383      if (p_st != NULL)
1384        *p_st = 0;
1385    } else {
1386      start = pr->u.p.parm2;
1387      init *= chunk;
1388      limit = chunk + init - 1;
1389      incr = pr->u.p.st;
1390      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1391
1392      KMP_DEBUG_ASSERT(init <= trip);
1393      if ((last = (limit >= trip)) != 0)
1394        limit = trip;
1395      if (p_st != NULL)
1396        *p_st = incr;
1397
1398      if (incr == 1) {
1399        *p_lb = start + init;
1400        *p_ub = start + limit;
1401      } else {
1402        *p_lb = start + init * incr;
1403        *p_ub = start + limit * incr;
1404      }
1405
1406      if (pr->flags.ordered) {
1407        pr->u.p.ordered_lower = init;
1408        pr->u.p.ordered_upper = limit;
1409      } // if
1410    } // if
1411    break;
1412  } // case
1413#endif // ( KMP_STATIC_STEAL_ENABLED )
1414  case kmp_sch_static_balanced: {
1415    KD_TRACE(
1416        10,
1417        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1418         gtid));
1419    /* check if thread has any iteration to do */
1420    if ((status = !pr->u.p.count) != 0) {
1421      pr->u.p.count = 1;
1422      *p_lb = pr->u.p.lb;
1423      *p_ub = pr->u.p.ub;
1424      last = pr->u.p.parm1;
1425      if (p_st != NULL)
1426        *p_st = pr->u.p.st;
1427    } else { /* no iterations to do */
1428      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1429    }
1430  } // case
1431  break;
1432  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1433                                 merged here */
1434  case kmp_sch_static_chunked: {
1435    T parm1;
1436
1437    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1438                   "kmp_sch_static_[affinity|chunked] case\n",
1439                   gtid));
1440    parm1 = pr->u.p.parm1;
1441
1442    trip = pr->u.p.tc - 1;
1443    init = parm1 * (pr->u.p.count + tid);
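    // Illustrative example (static_chunked): with parm1 = 2, nproc = 3 and
    // tc = 12, thread tid = 1 claims init = 2 (iterations 2-3) on its first
    // call; count then grows by nproc, so the next call claims
    // init = 2 * (3 + 1) = 8 (iterations 8-9), and the call after that
    // computes init = 14 > trip and finds no more work.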
1444
1445    if ((status = (init <= trip)) != 0) {
1446      start = pr->u.p.lb;
1447      incr = pr->u.p.st;
1448      limit = parm1 + init - 1;
1449
1450      if ((last = (limit >= trip)) != 0)
1451        limit = trip;
1452
1453      if (p_st != NULL)
1454        *p_st = incr;
1455
1456      pr->u.p.count += nproc;
1457
1458      if (incr == 1) {
1459        *p_lb = start + init;
1460        *p_ub = start + limit;
1461      } else {
1462        *p_lb = start + init * incr;
1463        *p_ub = start + limit * incr;
1464      }
1465
1466      if (pr->flags.ordered) {
1467        pr->u.p.ordered_lower = init;
1468        pr->u.p.ordered_upper = limit;
1469      } // if
1470    } // if
1471  } // case
1472  break;
1473
1474  case kmp_sch_dynamic_chunked: {
1475    T chunk = pr->u.p.parm1;
1476
1477    KD_TRACE(
1478        100,
1479        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1480         gtid));
1481
1482    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
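    // The shared counter hands out chunk indices in claim order across the
    // team: the k-th successful increment (by whichever thread) yields
    // iterations [k*chunk, k*chunk + chunk - 1] of the canonical space,
    // clipped to trip below.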
1483    trip = pr->u.p.tc - 1;
1484
1485    if ((status = (init <= trip)) == 0) {
1486      *p_lb = 0;
1487      *p_ub = 0;
1488      if (p_st != NULL)
1489        *p_st = 0;
1490    } else {
1491      start = pr->u.p.lb;
1492      limit = chunk + init - 1;
1493      incr = pr->u.p.st;
1494
1495      if ((last = (limit >= trip)) != 0)
1496        limit = trip;
1497
1498      if (p_st != NULL)
1499        *p_st = incr;
1500
1501      if (incr == 1) {
1502        *p_lb = start + init;
1503        *p_ub = start + limit;
1504      } else {
1505        *p_lb = start + init * incr;
1506        *p_ub = start + limit * incr;
1507      }
1508
1509      if (pr->flags.ordered) {
1510        pr->u.p.ordered_lower = init;
1511        pr->u.p.ordered_upper = limit;
1512      } // if
1513    } // if
1514  } // case
1515  break;
1516
1517  case kmp_sch_guided_iterative_chunked: {
1518    T chunkspec = pr->u.p.parm1;
1519    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1520                   "iterative case\n",
1521                   gtid));
1522    trip = pr->u.p.tc;
1523    // Start atomic part of calculations
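    // Each pass tries to claim a slice proportional to the remaining work
    // (remaining times the double stored in parm3, i.e. roughly
    // remaining/(K*nproc)) via CAS on the shared iteration counter, so
    // successive chunks shrink geometrically.  Once remaining falls below
    // parm2 (about K*nproc*(chunk+1)), the code falls back to plain dynamic
    // chunks of size chunkspec.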
1524    while (1) {
1525      ST remaining; // signed, because can be < 0
1526      init = sh->u.s.iteration; // shared value
1527      remaining = trip - init;
1528      if (remaining <= 0) { // AC: need to compare with 0 first
1529        // nothing to do, don't try atomic op
1530        status = 0;
1531        break;
1532      }
1533      if ((T)remaining <
1534          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1535        // use dynamic-style schedule
1536        // atomically increment iterations, get old value
1537        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1538                                 (ST)chunkspec);
1539        remaining = trip - init;
1540        if (remaining <= 0) {
1541          status = 0; // all iterations got by other threads
1542        } else {
1543          // got some iterations to work on
1544          status = 1;
1545          if ((T)remaining > chunkspec) {
1546            limit = init + chunkspec - 1;
1547          } else {
1548            last = 1; // the last chunk
1549            limit = init + remaining - 1;
1550          } // if
1551        } // if
1552        break;
1553      } // if
1554      limit = init +
1555              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1556      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1557                               (ST)init, (ST)limit)) {
1558        // CAS was successful, chunk obtained
1559        status = 1;
1560        --limit;
1561        break;
1562      } // if
1563    } // while
1564    if (status != 0) {
1565      start = pr->u.p.lb;
1566      incr = pr->u.p.st;
1567      if (p_st != NULL)
1568        *p_st = incr;
1569      *p_lb = start + init * incr;
1570      *p_ub = start + limit * incr;
1571      if (pr->flags.ordered) {
1572        pr->u.p.ordered_lower = init;
1573        pr->u.p.ordered_upper = limit;
1574      } // if
1575    } else {
1576      *p_lb = 0;
1577      *p_ub = 0;
1578      if (p_st != NULL)
1579        *p_st = 0;
1580    } // if
1581  } // case
1582  break;
1583
1584  case kmp_sch_guided_simd: {
1585    // same as iterative but curr-chunk adjusted to be multiple of given
1586    // chunk
1587    T chunk = pr->u.p.parm1;
1588    KD_TRACE(100,
1589             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1590              gtid));
1591    trip = pr->u.p.tc;
1592    // Start atomic part of calculations
1593    while (1) {
1594      ST remaining; // signed, because can be < 0
1595      init = sh->u.s.iteration; // shared value
1596      remaining = trip - init;
1597      if (remaining <= 0) { // AC: need to compare with 0 first
1598        status = 0; // nothing to do, don't try atomic op
1599        break;
1600      }
1601      KMP_DEBUG_ASSERT(init % chunk == 0);
1602      // compare with K*nproc*(chunk+1), K=2 by default
1603      if ((T)remaining < pr->u.p.parm2) {
1604        // use dynamic-style schedule
1605        // atomically increment iterations, get old value
1606        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1607                                 (ST)chunk);
1608        remaining = trip - init;
1609        if (remaining <= 0) {
1610          status = 0; // all iterations got by other threads
1611        } else {
1612          // got some iterations to work on
1613          status = 1;
1614          if ((T)remaining > chunk) {
1615            limit = init + chunk - 1;
1616          } else {
1617            last = 1; // the last chunk
1618            limit = init + remaining - 1;
1619          } // if
1620        } // if
1621        break;
1622      } // if
1623      // divide by K*nproc
1624      UT span = remaining * (*(double *)&pr->u.p.parm3);
1625      UT rem = span % chunk;
1626      if (rem) // adjust so that span%chunk == 0
1627        span += chunk - rem;
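      // e.g. with chunk = 8 a computed span of 21 is rounded up to 24, so
      // every chunk boundary stays a multiple of the requested (SIMD) chunk.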
1628      limit = init + span;
1629      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1630                               (ST)init, (ST)limit)) {
1631        // CAS was successful, chunk obtained
1632        status = 1;
1633        --limit;
1634        break;
1635      } // if
1636    } // while
1637    if (status != 0) {
1638      start = pr->u.p.lb;
1639      incr = pr->u.p.st;
1640      if (p_st != NULL)
1641        *p_st = incr;
1642      *p_lb = start + init * incr;
1643      *p_ub = start + limit * incr;
1644      if (pr->flags.ordered) {
1645        pr->u.p.ordered_lower = init;
1646        pr->u.p.ordered_upper = limit;
1647      } // if
1648    } else {
1649      *p_lb = 0;
1650      *p_ub = 0;
1651      if (p_st != NULL)
1652        *p_st = 0;
1653    } // if
1654  } // case
1655  break;
1656
1657  case kmp_sch_guided_analytical_chunked: {
1658    T chunkspec = pr->u.p.parm1;
1659    UT chunkIdx;
1660#if KMP_USE_X87CONTROL
1661    /* for storing original FPCW value for Windows* OS on
1662       IA-32 architecture 8-byte version */
1663    unsigned int oldFpcw;
1664    unsigned int fpcwSet = 0;
1665#endif
1666    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1667                   "kmp_sch_guided_analytical_chunked case\n",
1668                   gtid));
1669
1670    trip = pr->u.p.tc;
1671
1672    KMP_DEBUG_ASSERT(nproc > 1);
1673    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1674
1675    while (1) { /* this while loop is a safeguard against unexpected zero
1676                   chunk sizes */
1677      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1678      if (chunkIdx >= (UT)pr->u.p.parm2) {
1679        --trip;
1680        /* use dynamic-style scheduling */
1681        init = chunkIdx * chunkspec + pr->u.p.count;
1682        /* need to verify init > 0 in case of overflow in the above
1683         * calculation */
1684        if ((status = (init > 0 && init <= trip)) != 0) {
1685          limit = init + chunkspec - 1;
1686
1687          if ((last = (limit >= trip)) != 0)
1688            limit = trip;
1689        }
1690        break;
1691      } else {
1692/* use exponential-style scheduling */
1693/* The following check works around the lack of long double precision on
1694   Windows* OS, which can have the effect that init != 0 for
1695   chunkIdx == 0.
1696 */
1697#if KMP_USE_X87CONTROL
1698        /* If we haven't already done so, save original
1699           FPCW and set precision to 64-bit, as Windows* OS
1700           on IA-32 architecture defaults to 53-bit */
1701        if (!fpcwSet) {
1702          oldFpcw = _control87(0, 0);
1703          _control87(_PC_64, _MCW_PC);
1704          fpcwSet = 0x30000;
1705        }
1706#endif
1707        if (chunkIdx) {
1708          init = __kmp_dispatch_guided_remaining<T>(
1709              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1710          KMP_DEBUG_ASSERT(init);
1711          init = trip - init;
1712        } else
1713          init = 0;
1714        limit = trip - __kmp_dispatch_guided_remaining<T>(
1715                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
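        // init and limit are the iteration counts consumed after chunkIdx and
        // chunkIdx+1 chunks of the analytical guided curve; their difference
        // is exactly this chunk (--limit below makes the bound inclusive).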
1716        KMP_ASSERT(init <= limit);
1717        if (init < limit) {
1718          KMP_DEBUG_ASSERT(limit <= trip);
1719          --limit;
1720          status = 1;
1721          break;
1722        } // if
1723      } // if
1724    } // while (1)
1725#if KMP_USE_X87CONTROL
1726    /* restore FPCW if necessary
1727       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1728    */
1729    if (fpcwSet && (oldFpcw & fpcwSet))
1730      _control87(oldFpcw, _MCW_PC);
1731#endif
1732    if (status != 0) {
1733      start = pr->u.p.lb;
1734      incr = pr->u.p.st;
1735      if (p_st != NULL)
1736        *p_st = incr;
1737      *p_lb = start + init * incr;
1738      *p_ub = start + limit * incr;
1739      if (pr->flags.ordered) {
1740        pr->u.p.ordered_lower = init;
1741        pr->u.p.ordered_upper = limit;
1742      }
1743    } else {
1744      *p_lb = 0;
1745      *p_ub = 0;
1746      if (p_st != NULL)
1747        *p_st = 0;
1748    }
1749  } // case
1750  break;
1751
1752  case kmp_sch_trapezoidal: {
1753    UT index;
1754    T parm2 = pr->u.p.parm2;
1755    T parm3 = pr->u.p.parm3;
1756    T parm4 = pr->u.p.parm4;
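    // parm2 is the size of the first (largest) chunk, parm4 the amount by
    // which consecutive chunks shrink, and parm3 the total number of chunks
    // (all computed during dispatch init).  Worked example: parm2 = 10,
    // parm4 = 2 gives chunk sizes 10, 8, 6, ...; for index = 2 the formulas
    // below yield init = (2*(20 - 1*2))/2 = 18 and
    // limit = (3*(20 - 2*2))/2 - 1 = 23, i.e. the third chunk covers
    // iterations 18..23.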
1757    KD_TRACE(100,
1758             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1759              gtid));
1760
1761    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1762
1763    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1764    trip = pr->u.p.tc - 1;
1765
1766    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1767      *p_lb = 0;
1768      *p_ub = 0;
1769      if (p_st != NULL)
1770        *p_st = 0;
1771    } else {
1772      start = pr->u.p.lb;
1773      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1774      incr = pr->u.p.st;
1775
1776      if ((last = (limit >= trip)) != 0)
1777        limit = trip;
1778
1779      if (p_st != NULL)
1780        *p_st = incr;
1781
1782      if (incr == 1) {
1783        *p_lb = start + init;
1784        *p_ub = start + limit;
1785      } else {
1786        *p_lb = start + init * incr;
1787        *p_ub = start + limit * incr;
1788      }
1789
1790      if (pr->flags.ordered) {
1791        pr->u.p.ordered_lower = init;
1792        pr->u.p.ordered_upper = limit;
1793      } // if
1794    } // if
1795  } // case
1796  break;
1797  default: {
1798    status = 0; // to avoid complaints on uninitialized variable use
1799    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1800                KMP_HNT(GetNewerLibrary), // Hint
1801                __kmp_msg_null // Variadic argument list terminator
1802                );
1803  } break;
1804  } // switch
1805  if (p_last)
1806    *p_last = last;
1807#ifdef KMP_DEBUG
1808  if (pr->flags.ordered) {
1809    char *buff;
1810    // create format specifiers before the debug output
1811    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1812                            "ordered_lower:%%%s ordered_upper:%%%s\n",
1813                            traits_t<UT>::spec, traits_t<UT>::spec);
1814    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1815    __kmp_str_free(&buff);
1816  }
1817  {
1818    char *buff;
1819    // create format specifiers before the debug output
1820    buff = __kmp_str_format(
1821        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1822        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1823        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1824    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1825    __kmp_str_free(&buff);
1826  }
1827#endif
1828  return status;
1829}
1830
1831/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1832   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1833   is not called. */
1834#if OMPT_SUPPORT && OMPT_OPTIONAL
1835#define OMPT_LOOP_END                                                          \
1836  if (status == 0) {                                                           \
1837    if (ompt_enabled.ompt_callback_work) {                                     \
1838      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1839      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1840      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1841          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1842          &(task_info->task_data), 0, codeptr);                                \
1843    }                                                                          \
1844  }
1845// TODO: implement count
1846#else
1847#define OMPT_LOOP_END // no-op
1848#endif
1849
1850#if KMP_STATS_ENABLED
1851#define KMP_STATS_LOOP_END                                                     \
1852  {                                                                            \
1853    kmp_int64 u, l, t, i;                                                      \
1854    l = (kmp_int64)(*p_lb);                                                    \
1855    u = (kmp_int64)(*p_ub);                                                    \
1856    i = (kmp_int64)(pr->u.p.st);                                               \
1857    if (status == 0) {                                                         \
1858      t = 0;                                                                   \
1859      KMP_POP_PARTITIONED_TIMER();                                             \
1860    } else if (i == 1) {                                                       \
1861      if (u >= l)                                                              \
1862        t = u - l + 1;                                                         \
1863      else                                                                     \
1864        t = 0;                                                                 \
1865    } else if (i < 0) {                                                        \
1866      if (l >= u)                                                              \
1867        t = (l - u) / (-i) + 1;                                                \
1868      else                                                                     \
1869        t = 0;                                                                 \
1870    } else {                                                                   \
1871      if (u >= l)                                                              \
1872        t = (u - l) / i + 1;                                                   \
1873      else                                                                     \
1874        t = 0;                                                                 \
1875    }                                                                          \
1876    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1877  }
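// The arithmetic above computes the trip count of the returned chunk from its
// inclusive bounds, e.g. l = 0, u = 9, i = 2 gives t = (9 - 0) / 2 + 1 = 5
// (iterations 0, 2, 4, 6, 8).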
1878#else
1879#define KMP_STATS_LOOP_END /* Nothing */
1880#endif
1881
1882template <typename T>
1883static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1884                               T *p_lb, T *p_ub,
1885                               typename traits_t<T>::signed_t *p_st
1886#if OMPT_SUPPORT && OMPT_OPTIONAL
1887                               ,
1888                               void *codeptr
1889#endif
1890                               ) {
1891
1892  typedef typename traits_t<T>::unsigned_t UT;
1893  typedef typename traits_t<T>::signed_t ST;
1894  // This is potentially slightly misleading: schedule(runtime) will appear here
1895  // even if the actual runtime schedule is static. (Which points out a
1896  // disadvantage of schedule(runtime): even when static scheduling is used, it
1897  // costs more than a compile-time choice of static scheduling would.)
1898  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1899
1900  int status;
1901  dispatch_private_info_template<T> *pr;
1902  kmp_info_t *th = __kmp_threads[gtid];
1903  kmp_team_t *team = th->th.th_team;
1904
1905  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1906  KD_TRACE(
1907      1000,
1908      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1909       gtid, p_lb, p_ub, p_st, p_last));
1910
1911  if (team->t.t_serialized) {
1912    /* NOTE: serialize this dispatch because we are not at the active level */
1913    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1914        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1915    KMP_DEBUG_ASSERT(pr);
1916
1917    if ((status = (pr->u.p.tc != 0)) == 0) {
1918      *p_lb = 0;
1919      *p_ub = 0;
1920      //            if ( p_last != NULL )
1921      //                *p_last = 0;
1922      if (p_st != NULL)
1923        *p_st = 0;
1924      if (__kmp_env_consistency_check) {
1925        if (pr->pushed_ws != ct_none) {
1926          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1927        }
1928      }
1929    } else if (pr->flags.nomerge) {
1930      kmp_int32 last;
1931      T start;
1932      UT limit, trip, init;
1933      ST incr;
1934      T chunk = pr->u.p.parm1;
1935
1936      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1937                     gtid));
1938
1939      init = chunk * pr->u.p.count++;
1940      trip = pr->u.p.tc - 1;
1941
1942      if ((status = (init <= trip)) == 0) {
1943        *p_lb = 0;
1944        *p_ub = 0;
1945        //                if ( p_last != NULL )
1946        //                    *p_last = 0;
1947        if (p_st != NULL)
1948          *p_st = 0;
1949        if (__kmp_env_consistency_check) {
1950          if (pr->pushed_ws != ct_none) {
1951            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1952          }
1953        }
1954      } else {
1955        start = pr->u.p.lb;
1956        limit = chunk + init - 1;
1957        incr = pr->u.p.st;
1958
1959        if ((last = (limit >= trip)) != 0) {
1960          limit = trip;
1961#if KMP_OS_WINDOWS
1962          pr->u.p.last_upper = pr->u.p.ub;
1963#endif /* KMP_OS_WINDOWS */
1964        }
1965        if (p_last != NULL)
1966          *p_last = last;
1967        if (p_st != NULL)
1968          *p_st = incr;
1969        if (incr == 1) {
1970          *p_lb = start + init;
1971          *p_ub = start + limit;
1972        } else {
1973          *p_lb = start + init * incr;
1974          *p_ub = start + limit * incr;
1975        }
1976
1977        if (pr->flags.ordered) {
1978          pr->u.p.ordered_lower = init;
1979          pr->u.p.ordered_upper = limit;
1980#ifdef KMP_DEBUG
1981          {
1982            char *buff;
1983            // create format specifiers before the debug output
1984            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1985                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
1986                                    traits_t<UT>::spec, traits_t<UT>::spec);
1987            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1988                            pr->u.p.ordered_upper));
1989            __kmp_str_free(&buff);
1990          }
1991#endif
1992        } // if
1993      } // if
1994    } else {
1995      pr->u.p.tc = 0;
1996      *p_lb = pr->u.p.lb;
1997      *p_ub = pr->u.p.ub;
1998#if KMP_OS_WINDOWS
1999      pr->u.p.last_upper = *p_ub;
2000#endif /* KMP_OS_WINDOWS */
2001      if (p_last != NULL)
2002        *p_last = TRUE;
2003      if (p_st != NULL)
2004        *p_st = pr->u.p.st;
2005    } // if
2006#ifdef KMP_DEBUG
2007    {
2008      char *buff;
2009      // create format specifiers before the debug output
2010      buff = __kmp_str_format(
2011          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2012          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2013          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2014      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2015      __kmp_str_free(&buff);
2016    }
2017#endif
2018#if INCLUDE_SSC_MARKS
2019    SSC_MARK_DISPATCH_NEXT();
2020#endif
2021    OMPT_LOOP_END;
2022    KMP_STATS_LOOP_END;
2023    return status;
2024  } else {
2025    kmp_int32 last = 0;
2026    dispatch_shared_info_template<T> volatile *sh;
2027
2028    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2029                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2030
2031    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2032        th->th.th_dispatch->th_dispatch_pr_current);
2033    KMP_DEBUG_ASSERT(pr);
2034    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2035        th->th.th_dispatch->th_dispatch_sh_current);
2036    KMP_DEBUG_ASSERT(sh);
2037
2038#if KMP_USE_HIER_SCHED
2039    if (pr->flags.use_hier)
2040      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2041    else
2042#endif // KMP_USE_HIER_SCHED
2043      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2044                                                p_st, th->th.th_team_nproc,
2045                                                th->th.th_info.ds.ds_tid);
2046    // status == 0: no more iterations to execute
2047    if (status == 0) {
2048      UT num_done;
2049
2050      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2051#ifdef KMP_DEBUG
2052      {
2053        char *buff;
2054        // create format specifiers before the debug output
2055        buff = __kmp_str_format(
2056            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2057            traits_t<UT>::spec);
2058        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2059        __kmp_str_free(&buff);
2060      }
2061#endif
2062
2063#if KMP_USE_HIER_SCHED
2064      pr->flags.use_hier = FALSE;
2065#endif
2066      if ((ST)num_done == th->th.th_team_nproc - 1) {
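        // num_done counts threads that have finished this loop; the last one
        // to arrive resets the shared buffer and advances buffer_index so the
        // slot can be recycled by a later worksharing loop.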
2067#if (KMP_STATIC_STEAL_ENABLED)
2068        if (pr->schedule == kmp_sch_static_steal &&
2069            traits_t<T>::type_size > 4) {
2070          int i;
2071          kmp_info_t **other_threads = team->t.t_threads;
2072          // loop complete, safe to destroy locks used for stealing
2073          for (i = 0; i < th->th.th_team_nproc; ++i) {
2074            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2075            KMP_ASSERT(lck != NULL);
2076            __kmp_destroy_lock(lck);
2077            __kmp_free(lck);
2078            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2079          }
2080        }
2081#endif
2082        /* NOTE: release this buffer to be reused */
2083
2084        KMP_MB(); /* Flush all pending memory write invalidates.  */
2085
2086        sh->u.s.num_done = 0;
2087        sh->u.s.iteration = 0;
2088
2089        /* TODO replace with general release procedure? */
2090        if (pr->flags.ordered) {
2091          sh->u.s.ordered_iteration = 0;
2092        }
2093
2094        KMP_MB(); /* Flush all pending memory write invalidates.  */
2095
2096        sh->buffer_index += __kmp_dispatch_num_buffers;
2097        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2098                       gtid, sh->buffer_index));
2099
2100        KMP_MB(); /* Flush all pending memory write invalidates.  */
2101
2102      } // if
2103      if (__kmp_env_consistency_check) {
2104        if (pr->pushed_ws != ct_none) {
2105          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2106        }
2107      }
2108
2109      th->th.th_dispatch->th_deo_fcn = NULL;
2110      th->th.th_dispatch->th_dxo_fcn = NULL;
2111      th->th.th_dispatch->th_dispatch_sh_current = NULL;
2112      th->th.th_dispatch->th_dispatch_pr_current = NULL;
2113    } // if (status == 0)
2114#if KMP_OS_WINDOWS
2115    else if (last) {
2116      pr->u.p.last_upper = pr->u.p.ub;
2117    }
2118#endif /* KMP_OS_WINDOWS */
2119    if (p_last != NULL && status != 0)
2120      *p_last = last;
2121  } // if
2122
2123#ifdef KMP_DEBUG
2124  {
2125    char *buff;
2126    // create format specifiers before the debug output
2127    buff = __kmp_str_format(
2128        "__kmp_dispatch_next: T#%%d normal case: "
2129        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2130        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2131    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2132                  (p_last ? *p_last : 0), status));
2133    __kmp_str_free(&buff);
2134  }
2135#endif
2136#if INCLUDE_SSC_MARKS
2137  SSC_MARK_DISPATCH_NEXT();
2138#endif
2139  OMPT_LOOP_END;
2140  KMP_STATS_LOOP_END;
2141  return status;
2142}
2143
2144template <typename T>
2145static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2146                                  kmp_int32 *plastiter, T *plower, T *pupper,
2147                                  typename traits_t<T>::signed_t incr) {
2148  typedef typename traits_t<T>::unsigned_t UT;
2149  kmp_uint32 team_id;
2150  kmp_uint32 nteams;
2151  UT trip_count;
2152  kmp_team_t *team;
2153  kmp_info_t *th;
2154
2155  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2156  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2157#ifdef KMP_DEBUG
2158  typedef typename traits_t<T>::signed_t ST;
2159  {
2160    char *buff;
2161    // create format specifiers before the debug output
2162    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2163                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2164                            traits_t<T>::spec, traits_t<T>::spec,
2165                            traits_t<ST>::spec, traits_t<T>::spec);
2166    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2167    __kmp_str_free(&buff);
2168  }
2169#endif
2170
2171  if (__kmp_env_consistency_check) {
2172    if (incr == 0) {
2173      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2174                            loc);
2175    }
2176    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2177      // The loop is illegal.
2178      // Some zero-trip loops are maintained by the compiler, e.g.:
2179      //   for(i=10;i<0;++i) // lower >= upper - run-time check
2180      //   for(i=0;i>10;--i) // lower <= upper - run-time check
2181      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2182      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2183      // Compiler does not check the following illegal loops:
2184      //   for(i=0;i<10;i+=incr) // where incr<0
2185      //   for(i=10;i>0;i-=incr) // where incr<0
2186      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2187    }
2188  }
2189  th = __kmp_threads[gtid];
2190  team = th->th.th_team;
2191  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2192  nteams = th->th.th_teams_size.nteams;
2193  team_id = team->t.t_master_tid;
2194  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2195
2196  // compute global trip count
2197  if (incr == 1) {
2198    trip_count = *pupper - *plower + 1;
2199  } else if (incr == -1) {
2200    trip_count = *plower - *pupper + 1;
2201  } else if (incr > 0) {
2202    // upper-lower can exceed the limit of signed type
2203    trip_count = (UT)(*pupper - *plower) / incr + 1;
2204  } else {
2205    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2206  }
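  // e.g. *plower = 0, *pupper = 9, incr = 3 gives
  // trip_count = (9 - 0) / 3 + 1 = 4 (iterations 0, 3, 6, 9).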
2207
2208  if (trip_count <= nteams) {
2209    KMP_DEBUG_ASSERT(
2210        __kmp_static == kmp_sch_static_greedy ||
2211        __kmp_static ==
2212            kmp_sch_static_balanced); // Unknown static scheduling type.
2213    // only some teams get single iteration, others get nothing
2214    if (team_id < trip_count) {
2215      *pupper = *plower = *plower + team_id * incr;
2216    } else {
2217      *plower = *pupper + incr; // zero-trip loop
2218    }
2219    if (plastiter != NULL)
2220      *plastiter = (team_id == trip_count - 1);
2221  } else {
2222    if (__kmp_static == kmp_sch_static_balanced) {
2223      UT chunk = trip_count / nteams;
2224      UT extras = trip_count % nteams;
2225      *plower +=
2226          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2227      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
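      // Teams with id < extras receive chunk+1 iterations, the rest receive
      // chunk.  E.g. trip_count = 10, nteams = 4: chunk = 2, extras = 2, so
      // teams 0 and 1 get 3 iterations each and teams 2 and 3 get 2 each.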
2228      if (plastiter != NULL)
2229        *plastiter = (team_id == nteams - 1);
2230    } else {
2231      T chunk_inc_count =
2232          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2233      T upper = *pupper;
2234      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2235      // Unknown static scheduling type.
2236      *plower += team_id * chunk_inc_count;
2237      *pupper = *plower + chunk_inc_count - incr;
2238      // Check/correct bounds if needed
2239      if (incr > 0) {
2240        if (*pupper < *plower)
2241          *pupper = traits_t<T>::max_value;
2242        if (plastiter != NULL)
2243          *plastiter = *plower <= upper && *pupper > upper - incr;
2244        if (*pupper > upper)
2245          *pupper = upper; // tracker C73258
2246      } else {
2247        if (*pupper > *plower)
2248          *pupper = traits_t<T>::min_value;
2249        if (plastiter != NULL)
2250          *plastiter = *plower >= upper && *pupper < upper - incr;
2251        if (*pupper < upper)
2252          *pupper = upper; // tracker C73258
2253      }
2254    }
2255  }
2256}
2257
2258//-----------------------------------------------------------------------------
2259// Dispatch routines
2260//    Transfer call to template< type T >
2261//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2262//                         T lb, T ub, ST st, ST chunk )
2263extern "C" {
2264
2265/*!
2266@ingroup WORK_SHARING
2267@{
2268@param loc Source location
2269@param gtid Global thread id
2270@param schedule Schedule type
2271@param lb  Lower bound
2272@param ub  Upper bound
2273@param st  Step (or increment if you prefer)
2274@param chunk The chunk size to block with
2275
2276This function prepares the runtime to start a dynamically scheduled for loop,
2277saving the loop arguments.
2278These functions are all identical apart from the types of the arguments.
2279*/
2280
2281void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2282                            enum sched_type schedule, kmp_int32 lb,
2283                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2284  KMP_DEBUG_ASSERT(__kmp_init_serial);
2285#if OMPT_SUPPORT && OMPT_OPTIONAL
2286  OMPT_STORE_RETURN_ADDRESS(gtid);
2287#endif
2288  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2289}
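// A minimal sketch (illustrative only, not literal compiler output) of how a
// compiler might lower "#pragma omp for schedule(dynamic, 4)" over 0..n-1 onto
// this API from inside the outlined parallel region; body(), n, loc and gtid
// are placeholders:
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }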
2290/*!
2291See @ref __kmpc_dispatch_init_4
2292*/
2293void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2294                             enum sched_type schedule, kmp_uint32 lb,
2295                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2296  KMP_DEBUG_ASSERT(__kmp_init_serial);
2297#if OMPT_SUPPORT && OMPT_OPTIONAL
2298  OMPT_STORE_RETURN_ADDRESS(gtid);
2299#endif
2300  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2301}
2302
2303/*!
2304See @ref __kmpc_dispatch_init_4
2305*/
2306void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2307                            enum sched_type schedule, kmp_int64 lb,
2308                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2309  KMP_DEBUG_ASSERT(__kmp_init_serial);
2310#if OMPT_SUPPORT && OMPT_OPTIONAL
2311  OMPT_STORE_RETURN_ADDRESS(gtid);
2312#endif
2313  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2314}
2315
2316/*!
2317See @ref __kmpc_dispatch_init_4
2318*/
2319void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2320                             enum sched_type schedule, kmp_uint64 lb,
2321                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2322  KMP_DEBUG_ASSERT(__kmp_init_serial);
2323#if OMPT_SUPPORT && OMPT_OPTIONAL
2324  OMPT_STORE_RETURN_ADDRESS(gtid);
2325#endif
2326  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2327}
2328
2329/*!
2330See @ref __kmpc_dispatch_init_4
2331
2332These differ from the __kmpc_dispatch_init set of functions in that they are
2333called for the composite distribute parallel for construct. Thus, before
2334dispatching the regular iterations, the per-team iteration space must be calculated.
2335
2336These functions are all identical apart from the types of the arguments.
2337*/
2338void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2339                                 enum sched_type schedule, kmp_int32 *p_last,
2340                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2341                                 kmp_int32 chunk) {
2342  KMP_DEBUG_ASSERT(__kmp_init_serial);
2343#if OMPT_SUPPORT && OMPT_OPTIONAL
2344  OMPT_STORE_RETURN_ADDRESS(gtid);
2345#endif
2346  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2347  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2348}
2349
2350void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2351                                  enum sched_type schedule, kmp_int32 *p_last,
2352                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2353                                  kmp_int32 chunk) {
2354  KMP_DEBUG_ASSERT(__kmp_init_serial);
2355#if OMPT_SUPPORT && OMPT_OPTIONAL
2356  OMPT_STORE_RETURN_ADDRESS(gtid);
2357#endif
2358  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2359  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2360}
2361
2362void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2363                                 enum sched_type schedule, kmp_int32 *p_last,
2364                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2365                                 kmp_int64 chunk) {
2366  KMP_DEBUG_ASSERT(__kmp_init_serial);
2367#if OMPT_SUPPORT && OMPT_OPTIONAL
2368  OMPT_STORE_RETURN_ADDRESS(gtid);
2369#endif
2370  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2371  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2372}
2373
2374void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2375                                  enum sched_type schedule, kmp_int32 *p_last,
2376                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2377                                  kmp_int64 chunk) {
2378  KMP_DEBUG_ASSERT(__kmp_init_serial);
2379#if OMPT_SUPPORT && OMPT_OPTIONAL
2380  OMPT_STORE_RETURN_ADDRESS(gtid);
2381#endif
2382  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2383  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2384}
2385
2386/*!
2387@param loc Source code location
2388@param gtid Global thread id
2389@param p_last Pointer to a flag set to one if this is the last chunk or zero
2390otherwise
2391@param p_lb   Pointer to the lower bound for the next chunk of work
2392@param p_ub   Pointer to the upper bound for the next chunk of work
2393@param p_st   Pointer to the stride for the next chunk of work
2394@return one if there is work to be done, zero otherwise
2395
2396Get the next dynamically allocated chunk of work for this thread.
2397If there is no more work, then lb, ub and stride need not be modified.
2398*/
2399int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2400                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2401#if OMPT_SUPPORT && OMPT_OPTIONAL
2402  OMPT_STORE_RETURN_ADDRESS(gtid);
2403#endif
2404  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2405#if OMPT_SUPPORT && OMPT_OPTIONAL
2406                                        ,
2407                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
2408#endif
2409                                            );
2410}
2411
2412/*!
2413See @ref __kmpc_dispatch_next_4
2414*/
2415int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2416                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2417                            kmp_int32 *p_st) {
2418#if OMPT_SUPPORT && OMPT_OPTIONAL
2419  OMPT_STORE_RETURN_ADDRESS(gtid);
2420#endif
2421  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2422#if OMPT_SUPPORT && OMPT_OPTIONAL
2423                                         ,
2424                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2425#endif
2426                                             );
2427}
2428
2429/*!
2430See @ref __kmpc_dispatch_next_4
2431*/
2432int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2434#if OMPT_SUPPORT && OMPT_OPTIONAL
2435  OMPT_STORE_RETURN_ADDRESS(gtid);
2436#endif
2437  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2438#if OMPT_SUPPORT && OMPT_OPTIONAL
2439                                        ,
2440                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
2441#endif
2442                                            );
2443}
2444
2445/*!
2446See @ref __kmpc_dispatch_next_4
2447*/
2448int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2449                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2450                            kmp_int64 *p_st) {
2451#if OMPT_SUPPORT && OMPT_OPTIONAL
2452  OMPT_STORE_RETURN_ADDRESS(gtid);
2453#endif
2454  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2455#if OMPT_SUPPORT && OMPT_OPTIONAL
2456                                         ,
2457                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2458#endif
2459                                             );
2460}
2461
2462/*!
2463@param loc Source code location
2464@param gtid Global thread id
2465
2466Mark the end of a dynamic loop.
2467*/
2468void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2469  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2470}
2471
2472/*!
2473See @ref __kmpc_dispatch_fini_4
2474*/
2475void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2476  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2477}
2478
2479/*!
2480See @ref __kmpc_dispatch_fini_4
2481*/
2482void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2483  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2484}
2485
2486/*!
2487See @ref __kmpc_dispatch_fini_4
2488*/
2489void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2490  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2491}
2492/*! @} */
2493
2494//-----------------------------------------------------------------------------
2495// Non-template routines from kmp_dispatch.cpp used in other sources
2496
2497kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2498  return value == checker;
2499}
2500
2501kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2502  return value != checker;
2503}
2504
2505kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2506  return value < checker;
2507}
2508
2509kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2510  return value >= checker;
2511}
2512
2513kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2514  return value <= checker;
2515}
2516
2517kmp_uint32
2518__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2519             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2520             void *obj // Higher-level synchronization object, or NULL.
2521             ) {
2522  // note: we may not belong to a team at this point
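  // Spins until pred(*spinner, checker) is satisfied, yielding when the
  // machine is oversubscribed, and returns the last value observed.  An
  // illustrative use (hypothetical flag variable):
  //   __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);  // blocks until flag == 1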
2523  volatile kmp_uint32 *spin = spinner;
2524  kmp_uint32 check = checker;
2525  kmp_uint32 spins;
2526  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2527  kmp_uint32 r;
2528
2529  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2530  KMP_INIT_YIELD(spins);
2531  // main wait spin loop
2532  while (!f(r = TCR_4(*spin), check)) {
2533    KMP_FSYNC_SPIN_PREPARE(obj);
2534    /* GEH - remove this since it was accidentally introduced when kmp_wait was
2535       split. It causes problems with infinite recursion because of exit lock */
2536    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2537        __kmp_abort_thread(); */
2538    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2539  }
2540  KMP_FSYNC_SPIN_ACQUIRED(obj);
2541  return r;
2542}
2543
2544void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2545                      kmp_uint32 (*pred)(void *, kmp_uint32),
2546                      void *obj // Higher-level synchronization object, or NULL.
2547                      ) {
2548  // note: we may not belong to a team at this point
2549  void *spin = spinner;
2550  kmp_uint32 check = checker;
2551  kmp_uint32 spins;
2552  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2553
2554  KMP_FSYNC_SPIN_INIT(obj, spin);
2555  KMP_INIT_YIELD(spins);
2556  // main wait spin loop
2557  while (!f(spin, check)) {
2558    KMP_FSYNC_SPIN_PREPARE(obj);
2559    /* if we have waited a bit, or are oversubscribed, yield */
2560    /* pause is in the following code */
2561    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2562  }
2563  KMP_FSYNC_SPIN_ACQUIRED(obj);
2564}
2565
2566} // extern "C"
2567
2568#ifdef KMP_GOMP_COMPAT
2569
2570void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2571                               enum sched_type schedule, kmp_int32 lb,
2572                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2573                               int push_ws) {
2574  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2575                                 push_ws);
2576}
2577
2578void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2579                                enum sched_type schedule, kmp_uint32 lb,
2580                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2581                                int push_ws) {
2582  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2583                                  push_ws);
2584}
2585
2586void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2587                               enum sched_type schedule, kmp_int64 lb,
2588                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2589                               int push_ws) {
2590  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2591                                 push_ws);
2592}
2593
2594void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2595                                enum sched_type schedule, kmp_uint64 lb,
2596                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2597                                int push_ws) {
2598  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2599                                  push_ws);
2600}
2601
2602void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2603  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2604}
2605
2606void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2607  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2608}
2609
2610void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2611  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2612}
2613
2614void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2615  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2616}
2617
2618#endif /* KMP_GOMP_COMPAT */
2619
2620/* ------------------------------------------------------------------------ */
2621