/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// Avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

template <typename T> class CollapseAllocator {
  typedef T *pT;

private:
  static const size_t allocaSize = 32; // size limit for stack allocations
                                       // (8 bytes x 4 nested loops)
  char stackAlloc[allocaSize];
  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
  pT pTAlloc;

public:
  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
    if (n > maxElemCount) {
      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
    }
  }
  ~CollapseAllocator() {
    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
      __kmp_free(pTAlloc);
    }
  }
  T &operator[](int index) { return pTAlloc[index]; }
  operator const pT() { return pTAlloc; }
};
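
// Example: per-call scratch storage for a nest of n loops; it stays on the
// stack while n * sizeof(T) <= allocaSize and falls back to the heap
// otherwise:
//   CollapseAllocator<kmp_uint64> ivs(n);
//   ivs[0] = 0; // indexable like a plain array; heap memory (if any) is
//               // freed automatically when ivs goes out of scope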

//----------Loop canonicalization---------------------------------------------

// For a loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// The "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
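// Example: "iv < 10, step 3" becomes "iv <= 9"; "iv != 10, step 2" first
// becomes "iv < 10" (positive step) and then "iv <= 9"; "iv > 0, step -1"
// becomes "iv >= 1".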
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed):
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use span for this, it's too big.
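// Example: lb0 = 0, ub0 = 9, step = 3 with "<=" gives
// trip_count = (9 - 0) / 3 + 1 = 4 (IV values 0, 3, 6, 9).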

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Returns a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
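
// Example: for loop_type_int8 an original_iv of 0xFF is truncated to kmp_int8
// (-1) and sign-extended back to kmp_uint64, giving 0xFFFFFFFFFFFFFFFF;
// for loop_type_uint8 the same input simply stays 0xFF.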

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into the lower-upper bounds on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops; should start
    // with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind
// (rectangular case, so no bounds check is needed here).
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IVs will be returned as if they had kmp_uint64 type and
// will have to be converted to the original types in user code.
// Note: trip counts should already be calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do different
// interface without array or possibly template this over n
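// Example: for a nest with trip counts {4, 5} and new_iv = 13, the loop below
// produces iterations {13 / 5 = 2, 13 % 5 = 3}; each original IV is then
// reconstructed as lb0 + iteration * step (lb1 is 0 for rectangular loops).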
487extern "C" void
488__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
489                                 const bounds_info_t *original_bounds_nest,
490                                 /*out*/ kmp_uint64 *original_ivs,
491                                 kmp_index_t n) {
492
493  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
494
495  // First, calc corresponding iteration in every original loop:
496  for (kmp_index_t ind = n; ind > 0;) {
497    --ind;
498    auto bounds = &(original_bounds_nest[ind]);
499
500    // should be optimized to OPDIVREM:
501    auto temp = new_iv / bounds->trip_count;
502    auto iteration = new_iv % bounds->trip_count;
503    new_iv = temp;
504
505    iterations[ind] = iteration;
506  }
507  KMP_ASSERT(new_iv == 0);
508
509  for (kmp_index_t ind = 0; ind < n; ++ind) {
510    auto bounds = &(original_bounds_nest[ind]);
511
512    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
513  }
514}

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for the "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
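// Example: for "j <= 10" with lb(j) = i (lb0 = 0, lb1 = 1, ub0 = 10, ub1 = 0)
// and the outer i spanning [0, 5], span_smallest = min(0 + 1 * 0, 0 + 1 * 5)
// = 0 and span_biggest = 10 (both upper-bound candidates equal ub0).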
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // the lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for the ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // the lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust left side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there is no
// overflow; for that we'll adjust both upper and lower bounds as needed.
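// Example: in the triangular nest
//   for (i = 0; i <= N; ++i)
//     for (j = 0; j <= i; ++j)   // lb1 = 0, ub1 = 1
// the signs of lb1 and ub1 differ (0 vs 1), so the inner loop is widened to
// the rectangle 0 <= j <= N: ub1 becomes 0 and ub0 grows by
// (old_ub1 - 0) * span_biggest(i) = N.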
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // Get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we are now getting the same
    // span irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good at the biggest outer_iv value,
        // so we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // Still need to move the lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good at the biggest outer_iv value,
        // so we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // Still need to move the upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process the loop nest to have a parallelepiped shape,
// calculate the biggest spans for IVs on all levels and calculate the overall
// trip count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative, and not
// counting the initial iteration).
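// Example: with "<=", lb0 = 1, lb1 = 0 and step = 2, reaching
// original_ivs[ind] = 7 takes (7 - 1) / 2 = 3 iterations after the first one.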
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}

// Calculate number of iterations in the original or updated space resulting in
// original_ivs[ind] (only on this level, non-negative)
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take the updated bounds and calculate what new_iv will correspond to this
// point. When we are getting original IVs from new_iv, we have to adjust to
// fit into the original loop bounds. Getting new_iv for the adjusted original
// IVs will help with making more chunks non-empty.
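// Example: for updated trip counts {4, 5} and per-level iterations {2, 3} the
// loop below computes new_iv = (0 * 4 + 2) * 5 + 3 = 13, the inverse of the
// div/mod decomposition used for the rectangular case.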
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // Space is empty:
        return false;
      } else {
        // Go to next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: the lower bounds of all loops may not work -
// if on some of the iterations of the outer loops the inner loops are empty.
// Loop nest is in canonical form (with <= / >=).
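// Example: in
//   for (i = 0; i <= 2; ++i)
//     for (j = 0; j < i; ++j)
// the all-lower-bounds point (0, 0) is not in the space (the inner loop is
// empty for i = 0), so the search below advances to (1, 0) instead.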
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From current set of original IVs calculate next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next add one step to the iterations on the inner-most level, and see if we
  // need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate the old induction variable corresponding to overall
// new_iv for the chunk end.
// Return true if it fits into the upper bound on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef  std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops; may as well
    // use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in expanded space, but:
    // - we need to hit the original space lower bound, so need to account for
    // that
    // - we have to go into the original space, even if that means adding more
    // iterations than planned
    // - we have to go past (or equal to) the previous point (which is the
    // chunk starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big): we didn't reach the original lower bound. Use
      // a heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as the end,
      // so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >= bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate the old induction variable corresponding to overall
// new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to overall new_iv for the
// chunk end. If due to space extension we are getting old IVs outside of the
// boundaries, bring them into the boundaries. Need to do this in the runtime,
// esp. on the lower bounds side. When getting the result we need to make sure
// that the new chunk starts at the next position after the old chunk, not
// overlapping with it (this is done elsewhere), and we need to make sure the
// end of the chunk is further than the beginning of the chunk. We don't need
// an exact ending point here, just something more-or-less close to the
// desired chunk length; bigger is fine (smaller would be fine too, but we
// risk going into an infinite loop, so we do smaller only at the very end of
// the space). Result: false if we could not find the ending point in the
// original loop space. In this case the caller can use the original upper
// bounds as the end of the chunk. The chunk won't be empty, because it'll
// have at least the starting point, which is by construction in the original
// space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
  // First, calc corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next calculate the point, but in original loop nest.
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        return false;
      } else {
        // Go to next iteration on outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // can't be same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate upper bounds for the last loop iteration. Just use original upper
// bounds (adjusted when canonicalized to use <= / >=). No need to check that
// this point is in the original space (it's likely not).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the API
// will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate the bounds for the chunks in terms of the new IV,
// re-calculate them in terms of the old IVs (especially important on the left
// side, to hit the lower bounds and not step over), and pick the correct
// chunk for this thread (so it will calculate chunks up to the needed one).
// It could be optimized to calculate just this chunk, potentially a bit less
// well distributed among threads. It is designed to make sure that threads
// will receive predictable chunks, deterministically (so that the next nest
// of loops with similar characteristics will get exactly the same chunks on
// the same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future).
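// Hypothetical caller sketch (the real call is emitted by the compiler for a
// directive like "collapse(2)"; the names here are illustrative only):
//   bounds_info_t nest[2], chunk[2]; // nest[] filled from the loop bounds;
//                                    // e.g. nest[1].outer_iv = 0, ub1 = 1
//                                    // for a triangular "j <= i" inner loop
//   kmp_int32 last = 0;
//   if (__kmpc_for_collapsed_init(loc, gtid, nest, chunk, 2, &last)) {
//     // Iterate i from chunk[0].lb0_u64 to chunk[0].ub0_u64 and j from
//     // chunk[1].lb0_u64 to chunk[1].ub0_u64 (lb1/ub1 are 0 per contract).
//   }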
1294extern "C" kmp_int32
1295__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
1296                          /*in/out*/ bounds_info_t *original_bounds_nest,
1297                          /*out*/ bounds_info_t *chunk_bounds_nest,
1298                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
1299
1300  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
1301  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
1302
1303  if (__kmp_env_consistency_check) {
1304    __kmp_push_workshare(gtid, ct_pdo, loc);
1305  }
1306
1307  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
1308
1309  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
1310
1311  for (kmp_index_t i = 0; i < n; ++i) {
1312    updated_bounds_nest[i].b = original_bounds_nest[i];
1313  }
1314
1315  kmp_loop_nest_iv_t total =
1316      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
1317
1318  if (plastiter != NULL) {
1319    *plastiter = FALSE;
1320  }
1321
1322  if (total == 0) {
1323    // Loop won't execute:
1324    return FALSE;
1325  }
1326
1327  // OMPTODO: DISTRIBUTE is not supported yet
1328  __kmp_assert_valid_gtid(gtid);
1329  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
1330
1331  kmp_info_t *th = __kmp_threads[gtid];
1332  kmp_team_t *team = th->th.th_team;
1333  kmp_uint32 nth = team->t.t_nproc; // Number of threads
1334
1335  KMP_DEBUG_ASSERT(tid < nth);
1336
1337  CollapseAllocator<kmp_uint64> original_ivs_start(n);
1338
1339  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
1340                                       /*out*/ original_ivs_start)) {
1341    // Loop won't execute:
1342    return FALSE;
1343  }
1344
1345  // Not doing this optimization for one thread:
1346  // (1) more to test
1347  // (2) without it current contract that chunk_bounds_nest has only lb0 and
1348  // ub0, lb1 and ub1 are set to 0 and can be ignored.
1349  // if (nth == 1) {
1350  //  // One thread:
1351  //  // Copy all info from original_bounds_nest, it'll be good enough.
1352
1353  //  for (kmp_index_t i = 0; i < n; ++i) {
1354  //    chunk_bounds_nest[i] = original_bounds_nest[i];
1355  //  }
1356
1357  //  if (plastiter != NULL) {
1358  //    *plastiter = TRUE;
1359  //  }
1360  //  return TRUE;
1361  //}

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate for
    // the fact that the original space is not a parallelepiped and some
    // threads can be left without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    CollapseAllocator<kmp_uint64> original_ivs_end(n);
    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // No more loop iterations left to process;
        // this means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // No more loop iterations to process;
      // the previous chunk was the last chunk.
      break;
    }

    // original_ivs_start is next to the previous chunk's original_ivs_end;
    // we need to start the new chunk here, so chunks will be one after another
    // without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  return FALSE;
}