xref: /freebsd-src/contrib/llvm-project/openmp/runtime/src/kmp_collapse.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
106c3fb27SDimitry Andric /*
206c3fb27SDimitry Andric  * kmp_collapse.cpp -- loop collapse feature
306c3fb27SDimitry Andric  */
406c3fb27SDimitry Andric 
506c3fb27SDimitry Andric //===----------------------------------------------------------------------===//
606c3fb27SDimitry Andric //
706c3fb27SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
806c3fb27SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
906c3fb27SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
1006c3fb27SDimitry Andric //
1106c3fb27SDimitry Andric //===----------------------------------------------------------------------===//
1206c3fb27SDimitry Andric 
1306c3fb27SDimitry Andric #include "kmp.h"
1406c3fb27SDimitry Andric #include "kmp_error.h"
1506c3fb27SDimitry Andric #include "kmp_i18n.h"
1606c3fb27SDimitry Andric #include "kmp_itt.h"
1706c3fb27SDimitry Andric #include "kmp_stats.h"
1806c3fb27SDimitry Andric #include "kmp_str.h"
1906c3fb27SDimitry Andric #include "kmp_collapse.h"
2006c3fb27SDimitry Andric 
2106c3fb27SDimitry Andric #if OMPT_SUPPORT
2206c3fb27SDimitry Andric #include "ompt-specific.h"
2306c3fb27SDimitry Andric #endif
2406c3fb27SDimitry Andric 
2506c3fb27SDimitry Andric // OMPTODO: different style of comments (see kmp_sched)
2606c3fb27SDimitry Andric // OMPTODO: OMPT/OMPD
2706c3fb27SDimitry Andric 
2806c3fb27SDimitry Andric // avoid inadevertently using a library based abs
2906c3fb27SDimitry Andric template <typename T> T __kmp_abs(const T val) {
3006c3fb27SDimitry Andric   return (val < 0) ? -val : val;
3106c3fb27SDimitry Andric }
3206c3fb27SDimitry Andric kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
3306c3fb27SDimitry Andric kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
3406c3fb27SDimitry Andric 
3506c3fb27SDimitry Andric //----------------------------------------------------------------------------
3606c3fb27SDimitry Andric // Common functions for working with rectangular and non-rectangular loops
3706c3fb27SDimitry Andric //----------------------------------------------------------------------------
3806c3fb27SDimitry Andric 
395f757f3fSDimitry Andric template <typename T> int __kmp_sign(T val) {
405f757f3fSDimitry Andric   return (T(0) < val) - (val < T(0));
415f757f3fSDimitry Andric }
425f757f3fSDimitry Andric 
435f757f3fSDimitry Andric template <typename T> class CollapseAllocator {
445f757f3fSDimitry Andric   typedef T *pT;
455f757f3fSDimitry Andric 
465f757f3fSDimitry Andric private:
475f757f3fSDimitry Andric   static const size_t allocaSize = 32; // size limit for stack allocations
485f757f3fSDimitry Andric                                        // (8 bytes x 4 nested loops)
495f757f3fSDimitry Andric   char stackAlloc[allocaSize];
505f757f3fSDimitry Andric   static constexpr size_t maxElemCount = allocaSize / sizeof(T);
515f757f3fSDimitry Andric   pT pTAlloc;
525f757f3fSDimitry Andric 
535f757f3fSDimitry Andric public:
545f757f3fSDimitry Andric   CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
555f757f3fSDimitry Andric     if (n > maxElemCount) {
565f757f3fSDimitry Andric       pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
575f757f3fSDimitry Andric     }
585f757f3fSDimitry Andric   }
595f757f3fSDimitry Andric   ~CollapseAllocator() {
605f757f3fSDimitry Andric     if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
615f757f3fSDimitry Andric       __kmp_free(pTAlloc);
625f757f3fSDimitry Andric     }
635f757f3fSDimitry Andric   }
645f757f3fSDimitry Andric   T &operator[](int index) { return pTAlloc[index]; }
655f757f3fSDimitry Andric   operator const pT() { return pTAlloc; }
665f757f3fSDimitry Andric };
6706c3fb27SDimitry Andric 
6806c3fb27SDimitry Andric //----------Loop canonicalization---------------------------------------------
6906c3fb27SDimitry Andric 
7006c3fb27SDimitry Andric // For loop nest (any shape):
7106c3fb27SDimitry Andric // convert != to < or >;
7206c3fb27SDimitry Andric // switch from using < or > to <= or >=.
7306c3fb27SDimitry Andric // "bounds" array has to be allocated per thread.
7406c3fb27SDimitry Andric // All other internal functions will work only with canonicalized loops.
7506c3fb27SDimitry Andric template <typename T>
7606c3fb27SDimitry Andric void kmp_canonicalize_one_loop_XX(
7706c3fb27SDimitry Andric     ident_t *loc,
7806c3fb27SDimitry Andric     /*in/out*/ bounds_infoXX_template<T> *bounds) {
7906c3fb27SDimitry Andric 
8006c3fb27SDimitry Andric   if (__kmp_env_consistency_check) {
8106c3fb27SDimitry Andric     if (bounds->step == 0) {
8206c3fb27SDimitry Andric       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
8306c3fb27SDimitry Andric                             loc);
8406c3fb27SDimitry Andric     }
8506c3fb27SDimitry Andric   }
8606c3fb27SDimitry Andric 
8706c3fb27SDimitry Andric   if (bounds->comparison == comparison_t::comp_not_eq) {
8806c3fb27SDimitry Andric     // We can convert this to < or >, depends on the sign of the step:
8906c3fb27SDimitry Andric     if (bounds->step > 0) {
9006c3fb27SDimitry Andric       bounds->comparison = comparison_t::comp_less;
9106c3fb27SDimitry Andric     } else {
9206c3fb27SDimitry Andric       bounds->comparison = comparison_t::comp_greater;
9306c3fb27SDimitry Andric     }
9406c3fb27SDimitry Andric   }
9506c3fb27SDimitry Andric 
9606c3fb27SDimitry Andric   if (bounds->comparison == comparison_t::comp_less) {
9706c3fb27SDimitry Andric     // Note: ub0 can be unsigned. Should be Ok to hit overflow here,
9806c3fb27SDimitry Andric     // because ub0 + ub1*j should be still positive (otherwise loop was not
9906c3fb27SDimitry Andric     // well formed)
10006c3fb27SDimitry Andric     bounds->ub0 -= 1;
10106c3fb27SDimitry Andric     bounds->comparison = comparison_t::comp_less_or_eq;
10206c3fb27SDimitry Andric   } else if (bounds->comparison == comparison_t::comp_greater) {
10306c3fb27SDimitry Andric     bounds->ub0 += 1;
10406c3fb27SDimitry Andric     bounds->comparison = comparison_t::comp_greater_or_eq;
10506c3fb27SDimitry Andric   }
10606c3fb27SDimitry Andric }
10706c3fb27SDimitry Andric 
10806c3fb27SDimitry Andric // Canonicalize loop nest. original_bounds_nest is an array of length n.
10906c3fb27SDimitry Andric void kmp_canonicalize_loop_nest(ident_t *loc,
11006c3fb27SDimitry Andric                                 /*in/out*/ bounds_info_t *original_bounds_nest,
11106c3fb27SDimitry Andric                                 kmp_index_t n) {
11206c3fb27SDimitry Andric 
11306c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
11406c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
11506c3fb27SDimitry Andric 
11606c3fb27SDimitry Andric     switch (bounds->loop_type) {
11706c3fb27SDimitry Andric     case loop_type_t::loop_type_int32:
11806c3fb27SDimitry Andric       kmp_canonicalize_one_loop_XX<kmp_int32>(
11906c3fb27SDimitry Andric           loc,
12006c3fb27SDimitry Andric           /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
12106c3fb27SDimitry Andric       break;
12206c3fb27SDimitry Andric     case loop_type_t::loop_type_uint32:
12306c3fb27SDimitry Andric       kmp_canonicalize_one_loop_XX<kmp_uint32>(
12406c3fb27SDimitry Andric           loc,
12506c3fb27SDimitry Andric           /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
12606c3fb27SDimitry Andric       break;
12706c3fb27SDimitry Andric     case loop_type_t::loop_type_int64:
12806c3fb27SDimitry Andric       kmp_canonicalize_one_loop_XX<kmp_int64>(
12906c3fb27SDimitry Andric           loc,
13006c3fb27SDimitry Andric           /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
13106c3fb27SDimitry Andric       break;
13206c3fb27SDimitry Andric     case loop_type_t::loop_type_uint64:
13306c3fb27SDimitry Andric       kmp_canonicalize_one_loop_XX<kmp_uint64>(
13406c3fb27SDimitry Andric           loc,
13506c3fb27SDimitry Andric           /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
13606c3fb27SDimitry Andric       break;
13706c3fb27SDimitry Andric     default:
13806c3fb27SDimitry Andric       KMP_ASSERT(false);
13906c3fb27SDimitry Andric     }
14006c3fb27SDimitry Andric   }
14106c3fb27SDimitry Andric }
14206c3fb27SDimitry Andric 
14306c3fb27SDimitry Andric //----------Calculating trip count on one level-------------------------------
14406c3fb27SDimitry Andric 
14506c3fb27SDimitry Andric // Calculate trip count on this loop level.
14606c3fb27SDimitry Andric // We do this either for a rectangular loop nest,
14706c3fb27SDimitry Andric // or after an adjustment bringing the loops to a parallelepiped shape.
14806c3fb27SDimitry Andric // This number should not depend on the value of outer IV
14906c3fb27SDimitry Andric // even if the formular has lb1 and ub1.
15006c3fb27SDimitry Andric // Note: for non-rectangular loops don't use span for this, it's too big.
15106c3fb27SDimitry Andric 
15206c3fb27SDimitry Andric template <typename T>
15306c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
15406c3fb27SDimitry Andric     /*in/out*/ bounds_infoXX_template<T> *bounds) {
15506c3fb27SDimitry Andric 
15606c3fb27SDimitry Andric   if (bounds->comparison == comparison_t::comp_less_or_eq) {
15706c3fb27SDimitry Andric     if (bounds->ub0 < bounds->lb0) {
15806c3fb27SDimitry Andric       // Note: after this we don't need to calculate inner loops,
15906c3fb27SDimitry Andric       // but that should be an edge case:
16006c3fb27SDimitry Andric       bounds->trip_count = 0;
16106c3fb27SDimitry Andric     } else {
16206c3fb27SDimitry Andric       // ub - lb may exceed signed type range; we need to cast to
16306c3fb27SDimitry Andric       // kmp_loop_nest_iv_t anyway
16406c3fb27SDimitry Andric       bounds->trip_count =
16506c3fb27SDimitry Andric           static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
16606c3fb27SDimitry Andric               __kmp_abs(bounds->step) +
16706c3fb27SDimitry Andric           1;
16806c3fb27SDimitry Andric     }
16906c3fb27SDimitry Andric   } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
17006c3fb27SDimitry Andric     if (bounds->lb0 < bounds->ub0) {
17106c3fb27SDimitry Andric       // Note: after this we don't need to calculate inner loops,
17206c3fb27SDimitry Andric       // but that should be an edge case:
17306c3fb27SDimitry Andric       bounds->trip_count = 0;
17406c3fb27SDimitry Andric     } else {
17506c3fb27SDimitry Andric       // lb - ub may exceed signed type range; we need to cast to
17606c3fb27SDimitry Andric       // kmp_loop_nest_iv_t anyway
17706c3fb27SDimitry Andric       bounds->trip_count =
17806c3fb27SDimitry Andric           static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
17906c3fb27SDimitry Andric               __kmp_abs(bounds->step) +
18006c3fb27SDimitry Andric           1;
18106c3fb27SDimitry Andric     }
18206c3fb27SDimitry Andric   } else {
18306c3fb27SDimitry Andric     KMP_ASSERT(false);
18406c3fb27SDimitry Andric   }
18506c3fb27SDimitry Andric   return bounds->trip_count;
18606c3fb27SDimitry Andric }
18706c3fb27SDimitry Andric 
18806c3fb27SDimitry Andric // Calculate trip count on this loop level.
18906c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
19006c3fb27SDimitry Andric 
19106c3fb27SDimitry Andric   kmp_loop_nest_iv_t trip_count = 0;
19206c3fb27SDimitry Andric 
19306c3fb27SDimitry Andric   switch (bounds->loop_type) {
19406c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
19506c3fb27SDimitry Andric     trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
19606c3fb27SDimitry Andric         /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
19706c3fb27SDimitry Andric     break;
19806c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
19906c3fb27SDimitry Andric     trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
20006c3fb27SDimitry Andric         /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
20106c3fb27SDimitry Andric     break;
20206c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
20306c3fb27SDimitry Andric     trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
20406c3fb27SDimitry Andric         /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
20506c3fb27SDimitry Andric     break;
20606c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
20706c3fb27SDimitry Andric     trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
20806c3fb27SDimitry Andric         /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
20906c3fb27SDimitry Andric     break;
21006c3fb27SDimitry Andric   default:
21106c3fb27SDimitry Andric     KMP_ASSERT(false);
21206c3fb27SDimitry Andric   }
21306c3fb27SDimitry Andric 
21406c3fb27SDimitry Andric   return trip_count;
21506c3fb27SDimitry Andric }
21606c3fb27SDimitry Andric 
21706c3fb27SDimitry Andric //----------Trim original iv according to its type----------------------------
21806c3fb27SDimitry Andric 
21906c3fb27SDimitry Andric // Trim original iv according to its type.
22006c3fb27SDimitry Andric // Return kmp_uint64 value which can be easily used in all internal calculations
22106c3fb27SDimitry Andric // And can be statically cast back to original type in user code.
22206c3fb27SDimitry Andric kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
22306c3fb27SDimitry Andric   kmp_uint64 res = 0;
22406c3fb27SDimitry Andric 
22506c3fb27SDimitry Andric   switch (loop_iv_type) {
22606c3fb27SDimitry Andric   case loop_type_t::loop_type_int8:
22706c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
22806c3fb27SDimitry Andric     break;
22906c3fb27SDimitry Andric   case loop_type_t::loop_type_uint8:
23006c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
23106c3fb27SDimitry Andric     break;
23206c3fb27SDimitry Andric   case loop_type_t::loop_type_int16:
23306c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
23406c3fb27SDimitry Andric     break;
23506c3fb27SDimitry Andric   case loop_type_t::loop_type_uint16:
23606c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
23706c3fb27SDimitry Andric     break;
23806c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
23906c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
24006c3fb27SDimitry Andric     break;
24106c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
24206c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
24306c3fb27SDimitry Andric     break;
24406c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
24506c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
24606c3fb27SDimitry Andric     break;
24706c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
24806c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(original_iv);
24906c3fb27SDimitry Andric     break;
25006c3fb27SDimitry Andric   default:
25106c3fb27SDimitry Andric     KMP_ASSERT(false);
25206c3fb27SDimitry Andric   }
25306c3fb27SDimitry Andric 
25406c3fb27SDimitry Andric   return res;
25506c3fb27SDimitry Andric }
25606c3fb27SDimitry Andric 
25706c3fb27SDimitry Andric //----------Compare two IVs (remember they have a type)-----------------------
25806c3fb27SDimitry Andric 
25906c3fb27SDimitry Andric bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
26006c3fb27SDimitry Andric                 kmp_uint64 original_iv2) {
26106c3fb27SDimitry Andric   bool res = false;
26206c3fb27SDimitry Andric 
26306c3fb27SDimitry Andric   switch (loop_iv_type) {
26406c3fb27SDimitry Andric   case loop_type_t::loop_type_int8:
26506c3fb27SDimitry Andric     res = static_cast<kmp_int8>(original_iv1) ==
26606c3fb27SDimitry Andric           static_cast<kmp_int8>(original_iv2);
26706c3fb27SDimitry Andric     break;
26806c3fb27SDimitry Andric   case loop_type_t::loop_type_uint8:
26906c3fb27SDimitry Andric     res = static_cast<kmp_uint8>(original_iv1) ==
27006c3fb27SDimitry Andric           static_cast<kmp_uint8>(original_iv2);
27106c3fb27SDimitry Andric     break;
27206c3fb27SDimitry Andric   case loop_type_t::loop_type_int16:
27306c3fb27SDimitry Andric     res = static_cast<kmp_int16>(original_iv1) ==
27406c3fb27SDimitry Andric           static_cast<kmp_int16>(original_iv2);
27506c3fb27SDimitry Andric     break;
27606c3fb27SDimitry Andric   case loop_type_t::loop_type_uint16:
27706c3fb27SDimitry Andric     res = static_cast<kmp_uint16>(original_iv1) ==
27806c3fb27SDimitry Andric           static_cast<kmp_uint16>(original_iv2);
27906c3fb27SDimitry Andric     break;
28006c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
28106c3fb27SDimitry Andric     res = static_cast<kmp_int32>(original_iv1) ==
28206c3fb27SDimitry Andric           static_cast<kmp_int32>(original_iv2);
28306c3fb27SDimitry Andric     break;
28406c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
28506c3fb27SDimitry Andric     res = static_cast<kmp_uint32>(original_iv1) ==
28606c3fb27SDimitry Andric           static_cast<kmp_uint32>(original_iv2);
28706c3fb27SDimitry Andric     break;
28806c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
28906c3fb27SDimitry Andric     res = static_cast<kmp_int64>(original_iv1) ==
29006c3fb27SDimitry Andric           static_cast<kmp_int64>(original_iv2);
29106c3fb27SDimitry Andric     break;
29206c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
29306c3fb27SDimitry Andric     res = static_cast<kmp_uint64>(original_iv1) ==
29406c3fb27SDimitry Andric           static_cast<kmp_uint64>(original_iv2);
29506c3fb27SDimitry Andric     break;
29606c3fb27SDimitry Andric   default:
29706c3fb27SDimitry Andric     KMP_ASSERT(false);
29806c3fb27SDimitry Andric   }
29906c3fb27SDimitry Andric 
30006c3fb27SDimitry Andric   return res;
30106c3fb27SDimitry Andric }
30206c3fb27SDimitry Andric 
30306c3fb27SDimitry Andric //----------Calculate original iv on one level--------------------------------
30406c3fb27SDimitry Andric 
30506c3fb27SDimitry Andric // Return true if the point fits into upper bounds on this level,
30606c3fb27SDimitry Andric // false otherwise
30706c3fb27SDimitry Andric template <typename T>
30806c3fb27SDimitry Andric bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
30906c3fb27SDimitry Andric                                  const kmp_point_t original_ivs,
31006c3fb27SDimitry Andric                                  kmp_index_t ind) {
31106c3fb27SDimitry Andric 
31206c3fb27SDimitry Andric   T iv = static_cast<T>(original_ivs[ind]);
31306c3fb27SDimitry Andric   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
31406c3fb27SDimitry Andric 
31506c3fb27SDimitry Andric   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
31606c3fb27SDimitry Andric        (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
31706c3fb27SDimitry Andric       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
31806c3fb27SDimitry Andric        (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
31906c3fb27SDimitry Andric     // The calculated point is outside of loop upper boundary:
32006c3fb27SDimitry Andric     return false;
32106c3fb27SDimitry Andric   }
32206c3fb27SDimitry Andric 
32306c3fb27SDimitry Andric   return true;
32406c3fb27SDimitry Andric }
32506c3fb27SDimitry Andric 
32606c3fb27SDimitry Andric // Calculate one iv corresponding to iteration on the level ind.
32706c3fb27SDimitry Andric // Return true if it fits into lower-upper bounds on this level
32806c3fb27SDimitry Andric // (if not, we need to re-calculate)
32906c3fb27SDimitry Andric template <typename T>
33006c3fb27SDimitry Andric bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
33106c3fb27SDimitry Andric                         /*in/out*/ kmp_point_t original_ivs,
33206c3fb27SDimitry Andric                         const kmp_iterations_t iterations, kmp_index_t ind,
33306c3fb27SDimitry Andric                         bool start_with_lower_bound, bool checkBounds) {
33406c3fb27SDimitry Andric 
33506c3fb27SDimitry Andric   kmp_uint64 temp = 0;
33606c3fb27SDimitry Andric   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
33706c3fb27SDimitry Andric 
33806c3fb27SDimitry Andric   if (start_with_lower_bound) {
33906c3fb27SDimitry Andric     // we moved to the next iteration on one of outer loops, should start
34006c3fb27SDimitry Andric     // with the lower bound here:
34106c3fb27SDimitry Andric     temp = bounds->lb0 + bounds->lb1 * outer_iv;
34206c3fb27SDimitry Andric   } else {
34306c3fb27SDimitry Andric     auto iteration = iterations[ind];
34406c3fb27SDimitry Andric     temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
34506c3fb27SDimitry Andric   }
34606c3fb27SDimitry Andric 
34706c3fb27SDimitry Andric   // Now trim original iv according to its type:
34806c3fb27SDimitry Andric   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
34906c3fb27SDimitry Andric 
35006c3fb27SDimitry Andric   if (checkBounds) {
35106c3fb27SDimitry Andric     return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
35206c3fb27SDimitry Andric   } else {
35306c3fb27SDimitry Andric     return true;
35406c3fb27SDimitry Andric   }
35506c3fb27SDimitry Andric }
35606c3fb27SDimitry Andric 
35706c3fb27SDimitry Andric bool kmp_calc_one_iv(const bounds_info_t *bounds,
35806c3fb27SDimitry Andric                      /*in/out*/ kmp_point_t original_ivs,
35906c3fb27SDimitry Andric                      const kmp_iterations_t iterations, kmp_index_t ind,
36006c3fb27SDimitry Andric                      bool start_with_lower_bound, bool checkBounds) {
36106c3fb27SDimitry Andric 
36206c3fb27SDimitry Andric   switch (bounds->loop_type) {
36306c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
36406c3fb27SDimitry Andric     return kmp_calc_one_iv_XX<kmp_int32>(
36506c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds),
36606c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
36706c3fb27SDimitry Andric         checkBounds);
36806c3fb27SDimitry Andric     break;
36906c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
37006c3fb27SDimitry Andric     return kmp_calc_one_iv_XX<kmp_uint32>(
37106c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds),
37206c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
37306c3fb27SDimitry Andric         checkBounds);
37406c3fb27SDimitry Andric     break;
37506c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
37606c3fb27SDimitry Andric     return kmp_calc_one_iv_XX<kmp_int64>(
37706c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds),
37806c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
37906c3fb27SDimitry Andric         checkBounds);
38006c3fb27SDimitry Andric     break;
38106c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
38206c3fb27SDimitry Andric     return kmp_calc_one_iv_XX<kmp_uint64>(
38306c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds),
38406c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
38506c3fb27SDimitry Andric         checkBounds);
38606c3fb27SDimitry Andric     break;
38706c3fb27SDimitry Andric   default:
38806c3fb27SDimitry Andric     KMP_ASSERT(false);
38906c3fb27SDimitry Andric     return false;
39006c3fb27SDimitry Andric   }
39106c3fb27SDimitry Andric }
39206c3fb27SDimitry Andric 
39306c3fb27SDimitry Andric //----------Calculate original iv on one level for rectangular loop nest------
39406c3fb27SDimitry Andric 
39506c3fb27SDimitry Andric // Calculate one iv corresponding to iteration on the level ind.
39606c3fb27SDimitry Andric // Return true if it fits into lower-upper bounds on this level
39706c3fb27SDimitry Andric // (if not, we need to re-calculate)
39806c3fb27SDimitry Andric template <typename T>
39906c3fb27SDimitry Andric void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
40006c3fb27SDimitry Andric                                 /*in/out*/ kmp_uint64 *original_ivs,
40106c3fb27SDimitry Andric                                 const kmp_iterations_t iterations,
40206c3fb27SDimitry Andric                                 kmp_index_t ind) {
40306c3fb27SDimitry Andric 
40406c3fb27SDimitry Andric   auto iteration = iterations[ind];
40506c3fb27SDimitry Andric 
40606c3fb27SDimitry Andric   kmp_uint64 temp =
40706c3fb27SDimitry Andric       bounds->lb0 +
40806c3fb27SDimitry Andric       bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
40906c3fb27SDimitry Andric       iteration * bounds->step;
41006c3fb27SDimitry Andric 
41106c3fb27SDimitry Andric   // Now trim original iv according to its type:
41206c3fb27SDimitry Andric   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
41306c3fb27SDimitry Andric }
41406c3fb27SDimitry Andric 
41506c3fb27SDimitry Andric void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
41606c3fb27SDimitry Andric                              /*in/out*/ kmp_uint64 *original_ivs,
41706c3fb27SDimitry Andric                              const kmp_iterations_t iterations,
41806c3fb27SDimitry Andric                              kmp_index_t ind) {
41906c3fb27SDimitry Andric 
42006c3fb27SDimitry Andric   switch (bounds->loop_type) {
42106c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
42206c3fb27SDimitry Andric     kmp_calc_one_iv_rectang_XX<kmp_int32>(
42306c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds),
42406c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind);
42506c3fb27SDimitry Andric     break;
42606c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
42706c3fb27SDimitry Andric     kmp_calc_one_iv_rectang_XX<kmp_uint32>(
42806c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds),
42906c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind);
43006c3fb27SDimitry Andric     break;
43106c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
43206c3fb27SDimitry Andric     kmp_calc_one_iv_rectang_XX<kmp_int64>(
43306c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds),
43406c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind);
43506c3fb27SDimitry Andric     break;
43606c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
43706c3fb27SDimitry Andric     kmp_calc_one_iv_rectang_XX<kmp_uint64>(
43806c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds),
43906c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind);
44006c3fb27SDimitry Andric     break;
44106c3fb27SDimitry Andric   default:
44206c3fb27SDimitry Andric     KMP_ASSERT(false);
44306c3fb27SDimitry Andric   }
44406c3fb27SDimitry Andric }
44506c3fb27SDimitry Andric 
44606c3fb27SDimitry Andric //----------------------------------------------------------------------------
44706c3fb27SDimitry Andric // Rectangular loop nest
44806c3fb27SDimitry Andric //----------------------------------------------------------------------------
44906c3fb27SDimitry Andric 
45006c3fb27SDimitry Andric //----------Canonicalize loop nest and calculate trip count-------------------
45106c3fb27SDimitry Andric 
45206c3fb27SDimitry Andric // Canonicalize loop nest and calculate overall trip count.
45306c3fb27SDimitry Andric // "bounds_nest" has to be allocated per thread.
45406c3fb27SDimitry Andric // API will modify original bounds_nest array to bring it to a canonical form
45506c3fb27SDimitry Andric // (only <= and >=, no !=, <, >). If the original loop nest was already in a
45606c3fb27SDimitry Andric // canonical form there will be no changes to bounds in bounds_nest array
45706c3fb27SDimitry Andric // (only trip counts will be calculated).
45806c3fb27SDimitry Andric // Returns trip count of overall space.
45906c3fb27SDimitry Andric extern "C" kmp_loop_nest_iv_t
46006c3fb27SDimitry Andric __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
46106c3fb27SDimitry Andric                                  /*in/out*/ bounds_info_t *original_bounds_nest,
46206c3fb27SDimitry Andric                                  kmp_index_t n) {
46306c3fb27SDimitry Andric 
46406c3fb27SDimitry Andric   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
46506c3fb27SDimitry Andric 
46606c3fb27SDimitry Andric   kmp_loop_nest_iv_t total = 1;
46706c3fb27SDimitry Andric 
46806c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
46906c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
47006c3fb27SDimitry Andric 
47106c3fb27SDimitry Andric     kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
47206c3fb27SDimitry Andric     total *= trip_count;
47306c3fb27SDimitry Andric   }
47406c3fb27SDimitry Andric 
47506c3fb27SDimitry Andric   return total;
47606c3fb27SDimitry Andric }
47706c3fb27SDimitry Andric 
47806c3fb27SDimitry Andric //----------Calculate old induction variables---------------------------------
47906c3fb27SDimitry Andric 
48006c3fb27SDimitry Andric // Calculate old induction variables corresponding to overall new_iv.
48106c3fb27SDimitry Andric // Note: original IV will be returned as if it had kmp_uint64 type,
48206c3fb27SDimitry Andric // will have to be converted to original type in user code.
48306c3fb27SDimitry Andric // Note: trip counts should be already calculated by
48406c3fb27SDimitry Andric // __kmpc_process_loop_nest_rectang.
48506c3fb27SDimitry Andric // OMPTODO: special case 2, 3 nested loops: either do different
48606c3fb27SDimitry Andric // interface without array or possibly template this over n
48706c3fb27SDimitry Andric extern "C" void
48806c3fb27SDimitry Andric __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
48906c3fb27SDimitry Andric                                  const bounds_info_t *original_bounds_nest,
49006c3fb27SDimitry Andric                                  /*out*/ kmp_uint64 *original_ivs,
49106c3fb27SDimitry Andric                                  kmp_index_t n) {
49206c3fb27SDimitry Andric 
4935f757f3fSDimitry Andric   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
49406c3fb27SDimitry Andric 
49506c3fb27SDimitry Andric   // First, calc corresponding iteration in every original loop:
49606c3fb27SDimitry Andric   for (kmp_index_t ind = n; ind > 0;) {
49706c3fb27SDimitry Andric     --ind;
49806c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
49906c3fb27SDimitry Andric 
50006c3fb27SDimitry Andric     // should be optimized to OPDIVREM:
50106c3fb27SDimitry Andric     auto temp = new_iv / bounds->trip_count;
50206c3fb27SDimitry Andric     auto iteration = new_iv % bounds->trip_count;
50306c3fb27SDimitry Andric     new_iv = temp;
50406c3fb27SDimitry Andric 
50506c3fb27SDimitry Andric     iterations[ind] = iteration;
50606c3fb27SDimitry Andric   }
50706c3fb27SDimitry Andric   KMP_ASSERT(new_iv == 0);
50806c3fb27SDimitry Andric 
50906c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
51006c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
51106c3fb27SDimitry Andric 
51206c3fb27SDimitry Andric     kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
51306c3fb27SDimitry Andric   }
51406c3fb27SDimitry Andric }
51506c3fb27SDimitry Andric 
51606c3fb27SDimitry Andric //----------------------------------------------------------------------------
51706c3fb27SDimitry Andric // Non-rectangular loop nest
51806c3fb27SDimitry Andric //----------------------------------------------------------------------------
51906c3fb27SDimitry Andric 
52006c3fb27SDimitry Andric //----------Calculate maximum possible span of iv values on one level---------
52106c3fb27SDimitry Andric 
52206c3fb27SDimitry Andric // Calculate span for IV on this loop level for "<=" case.
52306c3fb27SDimitry Andric // Note: it's for <= on this loop nest level, so lower bound should be smallest
52406c3fb27SDimitry Andric // value, upper bound should be the biggest value. If the loop won't execute,
52506c3fb27SDimitry Andric // 'smallest' may be bigger than 'biggest', but we'd better not switch them
52606c3fb27SDimitry Andric // around.
52706c3fb27SDimitry Andric template <typename T>
52806c3fb27SDimitry Andric void kmp_calc_span_lessoreq_XX(
52906c3fb27SDimitry Andric     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
53006c3fb27SDimitry Andric     /* in/out*/ bounds_info_internal_t *bounds_nest) {
53106c3fb27SDimitry Andric 
53206c3fb27SDimitry Andric   typedef typename traits_t<T>::unsigned_t UT;
53306c3fb27SDimitry Andric   // typedef typename traits_t<T>::signed_t ST;
53406c3fb27SDimitry Andric 
53506c3fb27SDimitry Andric   // typedef typename big_span_t span_t;
53606c3fb27SDimitry Andric   typedef T span_t;
53706c3fb27SDimitry Andric 
53806c3fb27SDimitry Andric   auto &bbounds = bounds->b;
53906c3fb27SDimitry Andric 
54006c3fb27SDimitry Andric   if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
54106c3fb27SDimitry Andric     // This dimention depends on one of previous ones; can't be the outermost
54206c3fb27SDimitry Andric     // one.
54306c3fb27SDimitry Andric     bounds_info_internalXX_template<T> *previous =
54406c3fb27SDimitry Andric         reinterpret_cast<bounds_info_internalXX_template<T> *>(
54506c3fb27SDimitry Andric             &(bounds_nest[bbounds.outer_iv]));
54606c3fb27SDimitry Andric 
54706c3fb27SDimitry Andric     // OMPTODO: assert that T is compatible with loop variable type on
54806c3fb27SDimitry Andric     // 'previous' loop
54906c3fb27SDimitry Andric 
55006c3fb27SDimitry Andric     {
55106c3fb27SDimitry Andric       span_t bound_candidate1 =
55206c3fb27SDimitry Andric           bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
55306c3fb27SDimitry Andric       span_t bound_candidate2 =
55406c3fb27SDimitry Andric           bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
55506c3fb27SDimitry Andric       if (bound_candidate1 < bound_candidate2) {
55606c3fb27SDimitry Andric         bounds->span_smallest = bound_candidate1;
55706c3fb27SDimitry Andric       } else {
55806c3fb27SDimitry Andric         bounds->span_smallest = bound_candidate2;
55906c3fb27SDimitry Andric       }
56006c3fb27SDimitry Andric     }
56106c3fb27SDimitry Andric 
56206c3fb27SDimitry Andric     {
56306c3fb27SDimitry Andric       // We can't adjust the upper bound with respect to step, because
56406c3fb27SDimitry Andric       // lower bound might be off after adjustments
56506c3fb27SDimitry Andric 
56606c3fb27SDimitry Andric       span_t bound_candidate1 =
56706c3fb27SDimitry Andric           bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
56806c3fb27SDimitry Andric       span_t bound_candidate2 =
56906c3fb27SDimitry Andric           bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
57006c3fb27SDimitry Andric       if (bound_candidate1 < bound_candidate2) {
57106c3fb27SDimitry Andric         bounds->span_biggest = bound_candidate2;
57206c3fb27SDimitry Andric       } else {
57306c3fb27SDimitry Andric         bounds->span_biggest = bound_candidate1;
57406c3fb27SDimitry Andric       }
57506c3fb27SDimitry Andric     }
57606c3fb27SDimitry Andric   } else {
57706c3fb27SDimitry Andric     // Rectangular:
57806c3fb27SDimitry Andric     bounds->span_smallest = bbounds.lb0;
57906c3fb27SDimitry Andric     bounds->span_biggest = bbounds.ub0;
58006c3fb27SDimitry Andric   }
58106c3fb27SDimitry Andric   if (!bounds->loop_bounds_adjusted) {
58206c3fb27SDimitry Andric     // Here it's safe to reduce the space to the multiply of step.
58306c3fb27SDimitry Andric     // OMPTODO: check if the formular is correct.
58406c3fb27SDimitry Andric     // Also check if it would be safe to do this if we didn't adjust left side.
58506c3fb27SDimitry Andric     bounds->span_biggest -=
58606c3fb27SDimitry Andric         (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
58706c3fb27SDimitry Andric   }
58806c3fb27SDimitry Andric }
58906c3fb27SDimitry Andric 
59006c3fb27SDimitry Andric // Calculate span for IV on this loop level for ">=" case.
59106c3fb27SDimitry Andric template <typename T>
59206c3fb27SDimitry Andric void kmp_calc_span_greateroreq_XX(
59306c3fb27SDimitry Andric     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
59406c3fb27SDimitry Andric     /* in/out*/ bounds_info_internal_t *bounds_nest) {
59506c3fb27SDimitry Andric 
59606c3fb27SDimitry Andric   typedef typename traits_t<T>::unsigned_t UT;
59706c3fb27SDimitry Andric   // typedef typename traits_t<T>::signed_t ST;
59806c3fb27SDimitry Andric 
59906c3fb27SDimitry Andric   // typedef typename big_span_t span_t;
60006c3fb27SDimitry Andric   typedef T span_t;
60106c3fb27SDimitry Andric 
60206c3fb27SDimitry Andric   auto &bbounds = bounds->b;
60306c3fb27SDimitry Andric 
60406c3fb27SDimitry Andric   if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
60506c3fb27SDimitry Andric     // This dimention depends on one of previous ones; can't be the outermost
60606c3fb27SDimitry Andric     // one.
60706c3fb27SDimitry Andric     bounds_info_internalXX_template<T> *previous =
60806c3fb27SDimitry Andric         reinterpret_cast<bounds_info_internalXX_template<T> *>(
60906c3fb27SDimitry Andric             &(bounds_nest[bbounds.outer_iv]));
61006c3fb27SDimitry Andric 
61106c3fb27SDimitry Andric     // OMPTODO: assert that T is compatible with loop variable type on
61206c3fb27SDimitry Andric     // 'previous' loop
61306c3fb27SDimitry Andric 
61406c3fb27SDimitry Andric     {
61506c3fb27SDimitry Andric       span_t bound_candidate1 =
61606c3fb27SDimitry Andric           bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
61706c3fb27SDimitry Andric       span_t bound_candidate2 =
61806c3fb27SDimitry Andric           bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
61906c3fb27SDimitry Andric       if (bound_candidate1 >= bound_candidate2) {
62006c3fb27SDimitry Andric         bounds->span_smallest = bound_candidate1;
62106c3fb27SDimitry Andric       } else {
62206c3fb27SDimitry Andric         bounds->span_smallest = bound_candidate2;
62306c3fb27SDimitry Andric       }
62406c3fb27SDimitry Andric     }
62506c3fb27SDimitry Andric 
62606c3fb27SDimitry Andric     {
62706c3fb27SDimitry Andric       // We can't adjust the upper bound with respect to step, because
62806c3fb27SDimitry Andric       // lower bound might be off after adjustments
62906c3fb27SDimitry Andric 
63006c3fb27SDimitry Andric       span_t bound_candidate1 =
63106c3fb27SDimitry Andric           bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
63206c3fb27SDimitry Andric       span_t bound_candidate2 =
63306c3fb27SDimitry Andric           bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
63406c3fb27SDimitry Andric       if (bound_candidate1 >= bound_candidate2) {
63506c3fb27SDimitry Andric         bounds->span_biggest = bound_candidate2;
63606c3fb27SDimitry Andric       } else {
63706c3fb27SDimitry Andric         bounds->span_biggest = bound_candidate1;
63806c3fb27SDimitry Andric       }
63906c3fb27SDimitry Andric     }
64006c3fb27SDimitry Andric 
64106c3fb27SDimitry Andric   } else {
64206c3fb27SDimitry Andric     // Rectangular:
64306c3fb27SDimitry Andric     bounds->span_biggest = bbounds.lb0;
64406c3fb27SDimitry Andric     bounds->span_smallest = bbounds.ub0;
64506c3fb27SDimitry Andric   }
64606c3fb27SDimitry Andric   if (!bounds->loop_bounds_adjusted) {
64706c3fb27SDimitry Andric     // Here it's safe to reduce the space to the multiply of step.
64806c3fb27SDimitry Andric     // OMPTODO: check if the formular is correct.
64906c3fb27SDimitry Andric     // Also check if it would be safe to do this if we didn't adjust left side.
65006c3fb27SDimitry Andric     bounds->span_biggest -=
65106c3fb27SDimitry Andric         (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
65206c3fb27SDimitry Andric   }
65306c3fb27SDimitry Andric }
65406c3fb27SDimitry Andric 
65506c3fb27SDimitry Andric // Calculate maximum possible span for IV on this loop level.
65606c3fb27SDimitry Andric template <typename T>
65706c3fb27SDimitry Andric void kmp_calc_span_XX(
65806c3fb27SDimitry Andric     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
65906c3fb27SDimitry Andric     /* in/out*/ bounds_info_internal_t *bounds_nest) {
66006c3fb27SDimitry Andric 
66106c3fb27SDimitry Andric   if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
66206c3fb27SDimitry Andric     kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
66306c3fb27SDimitry Andric   } else {
66406c3fb27SDimitry Andric     KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
66506c3fb27SDimitry Andric     kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
66606c3fb27SDimitry Andric   }
66706c3fb27SDimitry Andric }
66806c3fb27SDimitry Andric 
66906c3fb27SDimitry Andric //----------All initial processing of the loop nest---------------------------
67006c3fb27SDimitry Andric 
67106c3fb27SDimitry Andric // Calculate new bounds for this loop level.
67206c3fb27SDimitry Andric // To be able to work with the nest we need to get it to a parallelepiped shape.
67306c3fb27SDimitry Andric // We need to stay in the original range of values, so that there will be no
67406c3fb27SDimitry Andric // overflow, for that we'll adjust both upper and lower bounds as needed.
67506c3fb27SDimitry Andric template <typename T>
67606c3fb27SDimitry Andric void kmp_calc_new_bounds_XX(
67706c3fb27SDimitry Andric     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
67806c3fb27SDimitry Andric     /* in/out*/ bounds_info_internal_t *bounds_nest) {
67906c3fb27SDimitry Andric 
68006c3fb27SDimitry Andric   auto &bbounds = bounds->b;
68106c3fb27SDimitry Andric 
68206c3fb27SDimitry Andric   if (bbounds.lb1 == bbounds.ub1) {
68306c3fb27SDimitry Andric     // Already parallel, no need to adjust:
68406c3fb27SDimitry Andric     bounds->loop_bounds_adjusted = false;
68506c3fb27SDimitry Andric   } else {
68606c3fb27SDimitry Andric     bounds->loop_bounds_adjusted = true;
68706c3fb27SDimitry Andric 
68806c3fb27SDimitry Andric     T old_lb1 = bbounds.lb1;
68906c3fb27SDimitry Andric     T old_ub1 = bbounds.ub1;
69006c3fb27SDimitry Andric 
69106c3fb27SDimitry Andric     if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
69206c3fb27SDimitry Andric       // With this shape we can adjust to a rectangle:
69306c3fb27SDimitry Andric       bbounds.lb1 = 0;
69406c3fb27SDimitry Andric       bbounds.ub1 = 0;
69506c3fb27SDimitry Andric     } else {
69606c3fb27SDimitry Andric       // get upper and lower bounds to be parallel
69706c3fb27SDimitry Andric       // with values in the old range.
69806c3fb27SDimitry Andric       // Note: abs didn't work here.
69906c3fb27SDimitry Andric       if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
70006c3fb27SDimitry Andric           ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
70106c3fb27SDimitry Andric         bbounds.lb1 = old_ub1;
70206c3fb27SDimitry Andric       } else {
70306c3fb27SDimitry Andric         bbounds.ub1 = old_lb1;
70406c3fb27SDimitry Andric       }
70506c3fb27SDimitry Andric     }
70606c3fb27SDimitry Andric 
70706c3fb27SDimitry Andric     // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
70806c3fb27SDimitry Andric     // The idea here that for this IV we are now getting the same span
70906c3fb27SDimitry Andric     // irrespective of the previous IV value.
71006c3fb27SDimitry Andric     bounds_info_internalXX_template<T> *previous =
71106c3fb27SDimitry Andric         reinterpret_cast<bounds_info_internalXX_template<T> *>(
71206c3fb27SDimitry Andric             &bounds_nest[bbounds.outer_iv]);
71306c3fb27SDimitry Andric 
71406c3fb27SDimitry Andric     if (bbounds.comparison == comparison_t::comp_less_or_eq) {
71506c3fb27SDimitry Andric       if (old_lb1 < bbounds.lb1) {
71606c3fb27SDimitry Andric         KMP_ASSERT(old_lb1 < 0);
71706c3fb27SDimitry Andric         // The length is good on outer_iv biggest number,
71806c3fb27SDimitry Andric         // can use it to find where to move the lower bound:
71906c3fb27SDimitry Andric 
72006c3fb27SDimitry Andric         T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
72106c3fb27SDimitry Andric         bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
72206c3fb27SDimitry Andric                             // e.g. it was 0?? (same below)
72306c3fb27SDimitry Andric       } else if (old_lb1 > bbounds.lb1) {
72406c3fb27SDimitry Andric         // still need to move lower bound:
72506c3fb27SDimitry Andric         T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
72606c3fb27SDimitry Andric         bbounds.lb0 += add;
72706c3fb27SDimitry Andric       }
72806c3fb27SDimitry Andric 
72906c3fb27SDimitry Andric       if (old_ub1 > bbounds.ub1) {
73006c3fb27SDimitry Andric         KMP_ASSERT(old_ub1 > 0);
73106c3fb27SDimitry Andric         // The length is good on outer_iv biggest number,
73206c3fb27SDimitry Andric         // can use it to find where to move upper bound:
73306c3fb27SDimitry Andric 
73406c3fb27SDimitry Andric         T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
73506c3fb27SDimitry Andric         bbounds.ub0 += add;
73606c3fb27SDimitry Andric       } else if (old_ub1 < bbounds.ub1) {
73706c3fb27SDimitry Andric         // still need to move upper bound:
73806c3fb27SDimitry Andric         T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
73906c3fb27SDimitry Andric         bbounds.ub0 -= sub;
74006c3fb27SDimitry Andric       }
74106c3fb27SDimitry Andric     } else {
74206c3fb27SDimitry Andric       KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
74306c3fb27SDimitry Andric       if (old_lb1 < bbounds.lb1) {
74406c3fb27SDimitry Andric         KMP_ASSERT(old_lb1 < 0);
74506c3fb27SDimitry Andric         T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
74606c3fb27SDimitry Andric         bbounds.lb0 -= sub;
74706c3fb27SDimitry Andric       } else if (old_lb1 > bbounds.lb1) {
74806c3fb27SDimitry Andric         T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
74906c3fb27SDimitry Andric         bbounds.lb0 += add;
75006c3fb27SDimitry Andric       }
75106c3fb27SDimitry Andric 
75206c3fb27SDimitry Andric       if (old_ub1 > bbounds.ub1) {
75306c3fb27SDimitry Andric         KMP_ASSERT(old_ub1 > 0);
75406c3fb27SDimitry Andric         T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
75506c3fb27SDimitry Andric         bbounds.ub0 += add;
75606c3fb27SDimitry Andric       } else if (old_ub1 < bbounds.ub1) {
75706c3fb27SDimitry Andric         T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
75806c3fb27SDimitry Andric         bbounds.ub0 -= sub;
75906c3fb27SDimitry Andric       }
76006c3fb27SDimitry Andric     }
76106c3fb27SDimitry Andric   }
76206c3fb27SDimitry Andric }
76306c3fb27SDimitry Andric 
76406c3fb27SDimitry Andric // Do all processing for one canonicalized loop in the nest
76506c3fb27SDimitry Andric // (assuming that outer loops already were processed):
76606c3fb27SDimitry Andric template <typename T>
76706c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_process_one_loop_XX(
76806c3fb27SDimitry Andric     /* in/out*/ bounds_info_internalXX_template<T> *bounds,
76906c3fb27SDimitry Andric     /*in/out*/ bounds_info_internal_t *bounds_nest) {
77006c3fb27SDimitry Andric 
77106c3fb27SDimitry Andric   kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
77206c3fb27SDimitry Andric   kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
77306c3fb27SDimitry Andric   return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
77406c3fb27SDimitry Andric }
77506c3fb27SDimitry Andric 
77606c3fb27SDimitry Andric // Non-rectangular loop nest, canonicalized to use <= or >=.
77706c3fb27SDimitry Andric // Process loop nest to have a parallelepiped shape,
77806c3fb27SDimitry Andric // calculate biggest spans for IV's on all levels and calculate overall trip
77906c3fb27SDimitry Andric // count. "bounds_nest" has to be allocated per thread.
78006c3fb27SDimitry Andric // Returns overall trip count (for adjusted space).
78106c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_process_loop_nest(
78206c3fb27SDimitry Andric     /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {
78306c3fb27SDimitry Andric 
78406c3fb27SDimitry Andric   kmp_loop_nest_iv_t total = 1;
78506c3fb27SDimitry Andric 
78606c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
78706c3fb27SDimitry Andric     auto bounds = &(bounds_nest[ind]);
78806c3fb27SDimitry Andric     kmp_loop_nest_iv_t trip_count = 0;
78906c3fb27SDimitry Andric 
79006c3fb27SDimitry Andric     switch (bounds->b.loop_type) {
79106c3fb27SDimitry Andric     case loop_type_t::loop_type_int32:
79206c3fb27SDimitry Andric       trip_count = kmp_process_one_loop_XX<kmp_int32>(
79306c3fb27SDimitry Andric           /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
79406c3fb27SDimitry Andric           /*in/out*/ bounds_nest);
79506c3fb27SDimitry Andric       break;
79606c3fb27SDimitry Andric     case loop_type_t::loop_type_uint32:
79706c3fb27SDimitry Andric       trip_count = kmp_process_one_loop_XX<kmp_uint32>(
79806c3fb27SDimitry Andric           /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
79906c3fb27SDimitry Andric           /*in/out*/ bounds_nest);
80006c3fb27SDimitry Andric       break;
80106c3fb27SDimitry Andric     case loop_type_t::loop_type_int64:
80206c3fb27SDimitry Andric       trip_count = kmp_process_one_loop_XX<kmp_int64>(
80306c3fb27SDimitry Andric           /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
80406c3fb27SDimitry Andric           /*in/out*/ bounds_nest);
80506c3fb27SDimitry Andric       break;
80606c3fb27SDimitry Andric     case loop_type_t::loop_type_uint64:
80706c3fb27SDimitry Andric       trip_count = kmp_process_one_loop_XX<kmp_uint64>(
80806c3fb27SDimitry Andric           /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
80906c3fb27SDimitry Andric           /*in/out*/ bounds_nest);
81006c3fb27SDimitry Andric       break;
81106c3fb27SDimitry Andric     default:
81206c3fb27SDimitry Andric       KMP_ASSERT(false);
81306c3fb27SDimitry Andric     }
81406c3fb27SDimitry Andric     total *= trip_count;
81506c3fb27SDimitry Andric   }
81606c3fb27SDimitry Andric 
81706c3fb27SDimitry Andric   return total;
81806c3fb27SDimitry Andric }
81906c3fb27SDimitry Andric 
82006c3fb27SDimitry Andric //----------Calculate iterations (in the original or updated space)-----------
82106c3fb27SDimitry Andric 
82206c3fb27SDimitry Andric // Calculate number of iterations in original or updated space resulting in
82306c3fb27SDimitry Andric // original_ivs[ind] (only on this level, non-negative)
82406c3fb27SDimitry Andric // (not counting initial iteration)
82506c3fb27SDimitry Andric template <typename T>
82606c3fb27SDimitry Andric kmp_loop_nest_iv_t
82706c3fb27SDimitry Andric kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
82806c3fb27SDimitry Andric                                  const kmp_point_t original_ivs,
82906c3fb27SDimitry Andric                                  kmp_index_t ind) {
83006c3fb27SDimitry Andric 
83106c3fb27SDimitry Andric   kmp_loop_nest_iv_t iterations = 0;
83206c3fb27SDimitry Andric 
83306c3fb27SDimitry Andric   if (bounds->comparison == comparison_t::comp_less_or_eq) {
83406c3fb27SDimitry Andric     iterations =
83506c3fb27SDimitry Andric         (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
83606c3fb27SDimitry Andric          bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
83706c3fb27SDimitry Andric         __kmp_abs(bounds->step);
83806c3fb27SDimitry Andric   } else {
83906c3fb27SDimitry Andric     KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
84006c3fb27SDimitry Andric     iterations = (bounds->lb0 +
84106c3fb27SDimitry Andric                   bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
84206c3fb27SDimitry Andric                   static_cast<T>(original_ivs[ind])) /
84306c3fb27SDimitry Andric                  __kmp_abs(bounds->step);
84406c3fb27SDimitry Andric   }
84506c3fb27SDimitry Andric 
84606c3fb27SDimitry Andric   return iterations;
84706c3fb27SDimitry Andric }
84806c3fb27SDimitry Andric 
84906c3fb27SDimitry Andric // Calculate number of iterations in the original or updated space resulting in
85006c3fb27SDimitry Andric // original_ivs[ind] (only on this level, non-negative)
85106c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
85206c3fb27SDimitry Andric                                                  const kmp_point_t original_ivs,
85306c3fb27SDimitry Andric                                                  kmp_index_t ind) {
85406c3fb27SDimitry Andric 
85506c3fb27SDimitry Andric   switch (bounds->loop_type) {
85606c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
85706c3fb27SDimitry Andric     return kmp_calc_number_of_iterations_XX<kmp_int32>(
85806c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
85906c3fb27SDimitry Andric     break;
86006c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
86106c3fb27SDimitry Andric     return kmp_calc_number_of_iterations_XX<kmp_uint32>(
86206c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
86306c3fb27SDimitry Andric     break;
86406c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
86506c3fb27SDimitry Andric     return kmp_calc_number_of_iterations_XX<kmp_int64>(
86606c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
86706c3fb27SDimitry Andric     break;
86806c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
86906c3fb27SDimitry Andric     return kmp_calc_number_of_iterations_XX<kmp_uint64>(
87006c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
87106c3fb27SDimitry Andric     break;
87206c3fb27SDimitry Andric   default:
87306c3fb27SDimitry Andric     KMP_ASSERT(false);
87406c3fb27SDimitry Andric     return 0;
87506c3fb27SDimitry Andric   }
87606c3fb27SDimitry Andric }
87706c3fb27SDimitry Andric 
87806c3fb27SDimitry Andric //----------Calculate new iv corresponding to original ivs--------------------
87906c3fb27SDimitry Andric 
88006c3fb27SDimitry Andric // We got a point in the original loop nest.
88106c3fb27SDimitry Andric // Take updated bounds and calculate what new_iv will correspond to this point.
88206c3fb27SDimitry Andric // When we are getting original IVs from new_iv, we have to adjust to fit into
88306c3fb27SDimitry Andric // original loops bounds. Getting new_iv for the adjusted original IVs will help
88406c3fb27SDimitry Andric // with making more chunks non-empty.
88506c3fb27SDimitry Andric kmp_loop_nest_iv_t
88606c3fb27SDimitry Andric kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
88706c3fb27SDimitry Andric                                   const kmp_point_t original_ivs,
88806c3fb27SDimitry Andric                                   kmp_index_t n) {
88906c3fb27SDimitry Andric 
89006c3fb27SDimitry Andric   kmp_loop_nest_iv_t new_iv = 0;
89106c3fb27SDimitry Andric 
89206c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
89306c3fb27SDimitry Andric     auto bounds = &(bounds_nest[ind].b);
89406c3fb27SDimitry Andric 
89506c3fb27SDimitry Andric     new_iv = new_iv * bounds->trip_count +
89606c3fb27SDimitry Andric              kmp_calc_number_of_iterations(bounds, original_ivs, ind);
89706c3fb27SDimitry Andric   }
89806c3fb27SDimitry Andric 
89906c3fb27SDimitry Andric   return new_iv;
90006c3fb27SDimitry Andric }
90106c3fb27SDimitry Andric 
90206c3fb27SDimitry Andric //----------Calculate original ivs for provided iterations--------------------
90306c3fb27SDimitry Andric 
90406c3fb27SDimitry Andric // Calculate original IVs for provided iterations, assuming iterations are
90506c3fb27SDimitry Andric // calculated in the original space.
90606c3fb27SDimitry Andric // Loop nest is in canonical form (with <= / >=).
90706c3fb27SDimitry Andric bool kmp_calc_original_ivs_from_iterations(
90806c3fb27SDimitry Andric     const bounds_info_t *original_bounds_nest, kmp_index_t n,
90906c3fb27SDimitry Andric     /*in/out*/ kmp_point_t original_ivs,
91006c3fb27SDimitry Andric     /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {
91106c3fb27SDimitry Andric 
91206c3fb27SDimitry Andric   kmp_index_t lengthened_ind = n;
91306c3fb27SDimitry Andric 
91406c3fb27SDimitry Andric   for (; ind < n;) {
91506c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
91606c3fb27SDimitry Andric     bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
91706c3fb27SDimitry Andric                                 ind, (lengthened_ind < ind), true);
91806c3fb27SDimitry Andric 
91906c3fb27SDimitry Andric     if (!good) {
92006c3fb27SDimitry Andric       // The calculated iv value is too big (or too small for >=):
92106c3fb27SDimitry Andric       if (ind == 0) {
92206c3fb27SDimitry Andric         // Space is empty:
92306c3fb27SDimitry Andric         return false;
92406c3fb27SDimitry Andric       } else {
92506c3fb27SDimitry Andric         // Go to next iteration on the outer loop:
92606c3fb27SDimitry Andric         --ind;
92706c3fb27SDimitry Andric         ++iterations[ind];
92806c3fb27SDimitry Andric         lengthened_ind = ind;
92906c3fb27SDimitry Andric         for (kmp_index_t i = ind + 1; i < n; ++i) {
93006c3fb27SDimitry Andric           iterations[i] = 0;
93106c3fb27SDimitry Andric         }
93206c3fb27SDimitry Andric         continue;
93306c3fb27SDimitry Andric       }
93406c3fb27SDimitry Andric     }
93506c3fb27SDimitry Andric     ++ind;
93606c3fb27SDimitry Andric   }
93706c3fb27SDimitry Andric 
93806c3fb27SDimitry Andric   return true;
93906c3fb27SDimitry Andric }
94006c3fb27SDimitry Andric 
94106c3fb27SDimitry Andric //----------Calculate original ivs for the beginning of the loop nest---------
94206c3fb27SDimitry Andric 
94306c3fb27SDimitry Andric // Calculate IVs for the beginning of the loop nest.
94406c3fb27SDimitry Andric // Note: lower bounds of all loops may not work -
94506c3fb27SDimitry Andric // if on some of the iterations of the outer loops inner loops are empty.
94606c3fb27SDimitry Andric // Loop nest is in canonical form (with <= / >=).
94706c3fb27SDimitry Andric bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
94806c3fb27SDimitry Andric                                      kmp_index_t n,
94906c3fb27SDimitry Andric                                      /*out*/ kmp_point_t original_ivs) {
95006c3fb27SDimitry Andric 
95106c3fb27SDimitry Andric   // Iterations in the original space, multiplied by step:
9525f757f3fSDimitry Andric   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
95306c3fb27SDimitry Andric   for (kmp_index_t ind = n; ind > 0;) {
95406c3fb27SDimitry Andric     --ind;
95506c3fb27SDimitry Andric     iterations[ind] = 0;
95606c3fb27SDimitry Andric   }
95706c3fb27SDimitry Andric 
95806c3fb27SDimitry Andric   // Now calculate the point:
95906c3fb27SDimitry Andric   bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
96006c3fb27SDimitry Andric                                                  /*in/out*/ original_ivs,
96106c3fb27SDimitry Andric                                                  /*in/out*/ iterations, 0);
96206c3fb27SDimitry Andric   return b;
96306c3fb27SDimitry Andric }
96406c3fb27SDimitry Andric 
96506c3fb27SDimitry Andric //----------Calculate next point in the original loop space-------------------
96606c3fb27SDimitry Andric 
96706c3fb27SDimitry Andric // From current set of original IVs calculate next point.
96806c3fb27SDimitry Andric // Return false if there is no next point in the loop bounds.
96906c3fb27SDimitry Andric bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
97006c3fb27SDimitry Andric                                 kmp_index_t n, const kmp_point_t original_ivs,
97106c3fb27SDimitry Andric                                 /*out*/ kmp_point_t next_original_ivs) {
97206c3fb27SDimitry Andric   // Iterations in the original space, multiplied by step (so can be negative):
9735f757f3fSDimitry Andric   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
97406c3fb27SDimitry Andric   // First, calc corresponding iteration in every original loop:
97506c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
97606c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
97706c3fb27SDimitry Andric     iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
97806c3fb27SDimitry Andric   }
97906c3fb27SDimitry Andric 
98006c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
98106c3fb27SDimitry Andric     next_original_ivs[ind] = original_ivs[ind];
98206c3fb27SDimitry Andric   }
98306c3fb27SDimitry Andric 
98406c3fb27SDimitry Andric   // Next add one step to the iterations on the inner-most level, and see if we
98506c3fb27SDimitry Andric   // need to move up the nest:
98606c3fb27SDimitry Andric   kmp_index_t ind = n - 1;
98706c3fb27SDimitry Andric   ++iterations[ind];
98806c3fb27SDimitry Andric 
98906c3fb27SDimitry Andric   bool b = kmp_calc_original_ivs_from_iterations(
99006c3fb27SDimitry Andric       original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
99106c3fb27SDimitry Andric 
99206c3fb27SDimitry Andric   return b;
99306c3fb27SDimitry Andric }
99406c3fb27SDimitry Andric 
99506c3fb27SDimitry Andric //----------Calculate chunk end in the original loop space--------------------
99606c3fb27SDimitry Andric 
99706c3fb27SDimitry Andric // For one level calculate old induction variable corresponding to overall
99806c3fb27SDimitry Andric // new_iv for the chunk end.
99906c3fb27SDimitry Andric // Return true if it fits into upper bound on this level
100006c3fb27SDimitry Andric // (if not, we need to re-calculate)
100106c3fb27SDimitry Andric template <typename T>
100206c3fb27SDimitry Andric bool kmp_calc_one_iv_for_chunk_end_XX(
100306c3fb27SDimitry Andric     const bounds_infoXX_template<T> *bounds,
100406c3fb27SDimitry Andric     const bounds_infoXX_template<T> *updated_bounds,
100506c3fb27SDimitry Andric     /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
100606c3fb27SDimitry Andric     kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
100706c3fb27SDimitry Andric     const kmp_point_t original_ivs_start) {
100806c3fb27SDimitry Andric 
100906c3fb27SDimitry Andric   // typedef  std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
101006c3fb27SDimitry Andric   // big_span_t;
101106c3fb27SDimitry Andric 
101206c3fb27SDimitry Andric   // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
101306c3fb27SDimitry Andric   T temp = 0;
101406c3fb27SDimitry Andric 
101506c3fb27SDimitry Andric   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
101606c3fb27SDimitry Andric 
101706c3fb27SDimitry Andric   if (start_with_lower_bound) {
101806c3fb27SDimitry Andric     // we moved to the next iteration on one of outer loops, may as well use
101906c3fb27SDimitry Andric     // the lower bound here:
102006c3fb27SDimitry Andric     temp = bounds->lb0 + bounds->lb1 * outer_iv;
102106c3fb27SDimitry Andric   } else {
102206c3fb27SDimitry Andric     // Start in expanded space, but:
102306c3fb27SDimitry Andric     // - we need to hit original space lower bound, so need to account for
102406c3fb27SDimitry Andric     // that
102506c3fb27SDimitry Andric     // - we have to go into original space, even if that means adding more
102606c3fb27SDimitry Andric     // iterations than was planned
102706c3fb27SDimitry Andric     // - we have to go past (or equal to) previous point (which is the chunk
102806c3fb27SDimitry Andric     // starting point)
102906c3fb27SDimitry Andric 
103006c3fb27SDimitry Andric     auto iteration = iterations[ind];
103106c3fb27SDimitry Andric 
103206c3fb27SDimitry Andric     auto step = bounds->step;
103306c3fb27SDimitry Andric 
103406c3fb27SDimitry Andric     // In case of >= it's negative:
103506c3fb27SDimitry Andric     auto accountForStep =
103606c3fb27SDimitry Andric         ((bounds->lb0 + bounds->lb1 * outer_iv) -
103706c3fb27SDimitry Andric          (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
103806c3fb27SDimitry Andric         step;
103906c3fb27SDimitry Andric 
104006c3fb27SDimitry Andric     temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
104106c3fb27SDimitry Andric            accountForStep + iteration * step;
104206c3fb27SDimitry Andric 
104306c3fb27SDimitry Andric     if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
104406c3fb27SDimitry Andric          (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
104506c3fb27SDimitry Andric         ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
104606c3fb27SDimitry Andric          (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
104706c3fb27SDimitry Andric       // Too small (or too big), didn't reach the original lower bound. Use
104806c3fb27SDimitry Andric       // heuristic:
104906c3fb27SDimitry Andric       temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
105006c3fb27SDimitry Andric     }
105106c3fb27SDimitry Andric 
105206c3fb27SDimitry Andric     if (compare_with_start) {
105306c3fb27SDimitry Andric 
105406c3fb27SDimitry Andric       T start = static_cast<T>(original_ivs_start[ind]);
105506c3fb27SDimitry Andric 
105606c3fb27SDimitry Andric       temp = kmp_fix_iv(bounds->loop_iv_type, temp);
105706c3fb27SDimitry Andric 
105806c3fb27SDimitry Andric       // On all previous levels start of the chunk is same as the end, need to
105906c3fb27SDimitry Andric       // be really careful here:
106006c3fb27SDimitry Andric       if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
106106c3fb27SDimitry Andric            (temp < start)) ||
106206c3fb27SDimitry Andric           ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
106306c3fb27SDimitry Andric            (temp > start))) {
106406c3fb27SDimitry Andric         // End of the chunk can't be smaller (for >= bigger) than it's start.
106506c3fb27SDimitry Andric         // Use heuristic:
106606c3fb27SDimitry Andric         temp = start + iteration / 4 * step;
106706c3fb27SDimitry Andric       }
106806c3fb27SDimitry Andric     }
106906c3fb27SDimitry Andric   }
107006c3fb27SDimitry Andric 
107106c3fb27SDimitry Andric   original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
107206c3fb27SDimitry Andric 
107306c3fb27SDimitry Andric   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
107406c3fb27SDimitry Andric        (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
107506c3fb27SDimitry Andric       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
107606c3fb27SDimitry Andric        (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
107706c3fb27SDimitry Andric     // Too big (or too small for >=).
107806c3fb27SDimitry Andric     return false;
107906c3fb27SDimitry Andric   }
108006c3fb27SDimitry Andric 
108106c3fb27SDimitry Andric   return true;
108206c3fb27SDimitry Andric }
108306c3fb27SDimitry Andric 
108406c3fb27SDimitry Andric // For one level calculate old induction variable corresponding to overall
108506c3fb27SDimitry Andric // new_iv for the chunk end.
108606c3fb27SDimitry Andric bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
108706c3fb27SDimitry Andric                                    const bounds_info_t *updated_bounds,
108806c3fb27SDimitry Andric                                    /*in/out*/ kmp_point_t original_ivs,
108906c3fb27SDimitry Andric                                    const kmp_iterations_t iterations,
109006c3fb27SDimitry Andric                                    kmp_index_t ind, bool start_with_lower_bound,
109106c3fb27SDimitry Andric                                    bool compare_with_start,
109206c3fb27SDimitry Andric                                    const kmp_point_t original_ivs_start) {
109306c3fb27SDimitry Andric 
109406c3fb27SDimitry Andric   switch (bounds->loop_type) {
109506c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
109606c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
109706c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds),
109806c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
109906c3fb27SDimitry Andric         /*in/out*/
110006c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
110106c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
110206c3fb27SDimitry Andric     break;
110306c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
110406c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
110506c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds),
110606c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
110706c3fb27SDimitry Andric         /*in/out*/
110806c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
110906c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
111006c3fb27SDimitry Andric     break;
111106c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
111206c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
111306c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds),
111406c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
111506c3fb27SDimitry Andric         /*in/out*/
111606c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
111706c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
111806c3fb27SDimitry Andric     break;
111906c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
112006c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
112106c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds),
112206c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
112306c3fb27SDimitry Andric         /*in/out*/
112406c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
112506c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
112606c3fb27SDimitry Andric     break;
112706c3fb27SDimitry Andric   default:
112806c3fb27SDimitry Andric     KMP_ASSERT(false);
112906c3fb27SDimitry Andric     return false;
113006c3fb27SDimitry Andric   }
113106c3fb27SDimitry Andric }
113206c3fb27SDimitry Andric 
113306c3fb27SDimitry Andric // Calculate old induction variables corresponding to overall new_iv for the
113406c3fb27SDimitry Andric // chunk end. If due to space extension we are getting old IVs outside of the
113506c3fb27SDimitry Andric // boundaries, bring them into the boundaries. Need to do this in the runtime,
113606c3fb27SDimitry Andric // esp. on the lower bounds side. When getting result need to make sure that the
113706c3fb27SDimitry Andric // new chunk starts at next position to old chunk, not overlaps with it (this is
113806c3fb27SDimitry Andric // done elsewhere), and need to make sure end of the chunk is further than the
113906c3fb27SDimitry Andric // beginning of the chunk. We don't need an exact ending point here, just
114006c3fb27SDimitry Andric // something more-or-less close to the desired chunk length, bigger is fine
114106c3fb27SDimitry Andric // (smaller would be fine, but we risk going into infinite loop, so do smaller
114206c3fb27SDimitry Andric // only at the very end of the space). result: false if could not find the
114306c3fb27SDimitry Andric // ending point in the original loop space. In this case the caller can use
114406c3fb27SDimitry Andric // original upper bounds as the end of the chunk. Chunk won't be empty, because
114506c3fb27SDimitry Andric // it'll have at least the starting point, which is by construction in the
114606c3fb27SDimitry Andric // original space.
114706c3fb27SDimitry Andric bool kmp_calc_original_ivs_for_chunk_end(
114806c3fb27SDimitry Andric     const bounds_info_t *original_bounds_nest, kmp_index_t n,
114906c3fb27SDimitry Andric     const bounds_info_internal_t *updated_bounds_nest,
115006c3fb27SDimitry Andric     const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
115106c3fb27SDimitry Andric     /*out*/ kmp_point_t original_ivs) {
115206c3fb27SDimitry Andric 
115306c3fb27SDimitry Andric   // Iterations in the expanded space:
11545f757f3fSDimitry Andric   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
115506c3fb27SDimitry Andric   // First, calc corresponding iteration in every modified loop:
115606c3fb27SDimitry Andric   for (kmp_index_t ind = n; ind > 0;) {
115706c3fb27SDimitry Andric     --ind;
115806c3fb27SDimitry Andric     auto &updated_bounds = updated_bounds_nest[ind];
115906c3fb27SDimitry Andric 
116006c3fb27SDimitry Andric     // should be optimized to OPDIVREM:
116106c3fb27SDimitry Andric     auto new_ind = new_iv / updated_bounds.b.trip_count;
116206c3fb27SDimitry Andric     auto iteration = new_iv % updated_bounds.b.trip_count;
116306c3fb27SDimitry Andric 
116406c3fb27SDimitry Andric     new_iv = new_ind;
116506c3fb27SDimitry Andric     iterations[ind] = iteration;
116606c3fb27SDimitry Andric   }
116706c3fb27SDimitry Andric   KMP_DEBUG_ASSERT(new_iv == 0);
116806c3fb27SDimitry Andric 
116906c3fb27SDimitry Andric   kmp_index_t lengthened_ind = n;
117006c3fb27SDimitry Andric   kmp_index_t equal_ind = -1;
117106c3fb27SDimitry Andric 
117206c3fb27SDimitry Andric   // Next calculate the point, but in original loop nest.
117306c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n;) {
117406c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
117506c3fb27SDimitry Andric     auto updated_bounds = &(updated_bounds_nest[ind].b);
117606c3fb27SDimitry Andric 
117706c3fb27SDimitry Andric     bool good = kmp_calc_one_iv_for_chunk_end(
117806c3fb27SDimitry Andric         bounds, updated_bounds,
117906c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
118006c3fb27SDimitry Andric         (equal_ind >= ind - 1), original_ivs_start);
118106c3fb27SDimitry Andric 
118206c3fb27SDimitry Andric     if (!good) {
118306c3fb27SDimitry Andric       // Too big (or too small for >=).
118406c3fb27SDimitry Andric       if (ind == 0) {
118506c3fb27SDimitry Andric         // Need to reduce to the end.
118606c3fb27SDimitry Andric         return false;
118706c3fb27SDimitry Andric       } else {
118806c3fb27SDimitry Andric         // Go to next iteration on outer loop:
118906c3fb27SDimitry Andric         --ind;
119006c3fb27SDimitry Andric         ++(iterations[ind]);
119106c3fb27SDimitry Andric         lengthened_ind = ind;
119206c3fb27SDimitry Andric         if (equal_ind >= lengthened_ind) {
119306c3fb27SDimitry Andric           // We've changed the number of iterations here,
119406c3fb27SDimitry Andric           // can't be same anymore:
119506c3fb27SDimitry Andric           equal_ind = lengthened_ind - 1;
119606c3fb27SDimitry Andric         }
119706c3fb27SDimitry Andric         for (kmp_index_t i = ind + 1; i < n; ++i) {
119806c3fb27SDimitry Andric           iterations[i] = 0;
119906c3fb27SDimitry Andric         }
120006c3fb27SDimitry Andric         continue;
120106c3fb27SDimitry Andric       }
120206c3fb27SDimitry Andric     }
120306c3fb27SDimitry Andric 
120406c3fb27SDimitry Andric     if ((equal_ind == ind - 1) &&
120506c3fb27SDimitry Andric         (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
120606c3fb27SDimitry Andric                     original_ivs_start[ind]))) {
120706c3fb27SDimitry Andric       equal_ind = ind;
120806c3fb27SDimitry Andric     } else if ((equal_ind > ind - 1) &&
120906c3fb27SDimitry Andric                !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
121006c3fb27SDimitry Andric                             original_ivs_start[ind]))) {
121106c3fb27SDimitry Andric       equal_ind = ind - 1;
121206c3fb27SDimitry Andric     }
121306c3fb27SDimitry Andric     ++ind;
121406c3fb27SDimitry Andric   }
121506c3fb27SDimitry Andric 
121606c3fb27SDimitry Andric   return true;
121706c3fb27SDimitry Andric }
121806c3fb27SDimitry Andric 
121906c3fb27SDimitry Andric //----------Calculate upper bounds for the last chunk-------------------------
122006c3fb27SDimitry Andric 
122106c3fb27SDimitry Andric // Calculate one upper bound for the end.
122206c3fb27SDimitry Andric template <typename T>
122306c3fb27SDimitry Andric void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
122406c3fb27SDimitry Andric                             /*in/out*/ kmp_point_t original_ivs,
122506c3fb27SDimitry Andric                             kmp_index_t ind) {
122606c3fb27SDimitry Andric 
122706c3fb27SDimitry Andric   T temp = bounds->ub0 +
122806c3fb27SDimitry Andric            bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
122906c3fb27SDimitry Andric 
123006c3fb27SDimitry Andric   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
123106c3fb27SDimitry Andric }
123206c3fb27SDimitry Andric 
123306c3fb27SDimitry Andric void kmp_calc_one_iv_end(const bounds_info_t *bounds,
123406c3fb27SDimitry Andric                          /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
123506c3fb27SDimitry Andric 
123606c3fb27SDimitry Andric   switch (bounds->loop_type) {
123706c3fb27SDimitry Andric   default:
123806c3fb27SDimitry Andric     KMP_ASSERT(false);
123906c3fb27SDimitry Andric     break;
124006c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
124106c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_int32>(
124206c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds),
124306c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
124406c3fb27SDimitry Andric     break;
124506c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
124606c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_uint32>(
124706c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds),
124806c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
124906c3fb27SDimitry Andric     break;
125006c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
125106c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_int64>(
125206c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds),
125306c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
125406c3fb27SDimitry Andric     break;
125506c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
125606c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_uint64>(
125706c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds),
125806c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
125906c3fb27SDimitry Andric     break;
126006c3fb27SDimitry Andric   }
126106c3fb27SDimitry Andric }
126206c3fb27SDimitry Andric 
126306c3fb27SDimitry Andric // Calculate upper bounds for the last loop iteration. Just use original upper
126406c3fb27SDimitry Andric // bounds (adjusted when canonicalized to use <= / >=). No need to check that
126506c3fb27SDimitry Andric // this point is in the original space (it's likely not)
126606c3fb27SDimitry Andric void kmp_calc_original_ivs_for_end(
126706c3fb27SDimitry Andric     const bounds_info_t *const original_bounds_nest, kmp_index_t n,
126806c3fb27SDimitry Andric     /*out*/ kmp_point_t original_ivs) {
126906c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
127006c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
127106c3fb27SDimitry Andric     kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
127206c3fb27SDimitry Andric   }
127306c3fb27SDimitry Andric }
127406c3fb27SDimitry Andric 
1275*0fca6ea1SDimitry Andric /**************************************************************************
1276*0fca6ea1SDimitry Andric  * Identify nested loop structure - loops come in the canonical form
1277*0fca6ea1SDimitry Andric  * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
1278*0fca6ea1SDimitry Andric  *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
1279*0fca6ea1SDimitry Andric  * Upper Triangle matrix
1280*0fca6ea1SDimitry Andric  *                        i = 0;     i <= N; i++    {0,0}:{N,0}
1281*0fca6ea1SDimitry Andric  *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
1282*0fca6ea1SDimitry Andric  * ************************************************************************/
1283*0fca6ea1SDimitry Andric nested_loop_type_t
1284*0fca6ea1SDimitry Andric kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
1285*0fca6ea1SDimitry Andric                                    /*in*/ kmp_index_t n) {
1286*0fca6ea1SDimitry Andric   // only 2-level nested loops are supported
1287*0fca6ea1SDimitry Andric   if (n != 2) {
1288*0fca6ea1SDimitry Andric     return nested_loop_type_unkown;
1289*0fca6ea1SDimitry Andric   }
1290*0fca6ea1SDimitry Andric   // loops must be canonical
1291*0fca6ea1SDimitry Andric   KMP_ASSERT(
1292*0fca6ea1SDimitry Andric       (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
1293*0fca6ea1SDimitry Andric       (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
1294*0fca6ea1SDimitry Andric   // check outer loop bounds: for triangular need to be {0,0}:{N,0}
1295*0fca6ea1SDimitry Andric   kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1296*0fca6ea1SDimitry Andric                                         original_bounds_nest[0].lb0_u64);
1297*0fca6ea1SDimitry Andric   kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1298*0fca6ea1SDimitry Andric                                         original_bounds_nest[0].ub0_u64);
1299*0fca6ea1SDimitry Andric   kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1300*0fca6ea1SDimitry Andric                                         original_bounds_nest[0].lb1_u64);
1301*0fca6ea1SDimitry Andric   kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1302*0fca6ea1SDimitry Andric                                         original_bounds_nest[0].ub1_u64);
1303*0fca6ea1SDimitry Andric   if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
1304*0fca6ea1SDimitry Andric     return nested_loop_type_unkown;
1305*0fca6ea1SDimitry Andric   }
1306*0fca6ea1SDimitry Andric   // check inner bounds to determine triangle type
1307*0fca6ea1SDimitry Andric   kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1308*0fca6ea1SDimitry Andric                                         original_bounds_nest[1].lb0_u64);
1309*0fca6ea1SDimitry Andric   kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1310*0fca6ea1SDimitry Andric                                         original_bounds_nest[1].ub0_u64);
1311*0fca6ea1SDimitry Andric   kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1312*0fca6ea1SDimitry Andric                                         original_bounds_nest[1].lb1_u64);
1313*0fca6ea1SDimitry Andric   kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1314*0fca6ea1SDimitry Andric                                         original_bounds_nest[1].ub1_u64);
1315*0fca6ea1SDimitry Andric   // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
1316*0fca6ea1SDimitry Andric   if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
1317*0fca6ea1SDimitry Andric       (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
1318*0fca6ea1SDimitry Andric     return nested_loop_type_lower_triangular_matrix;
1319*0fca6ea1SDimitry Andric   }
1320*0fca6ea1SDimitry Andric   // upper triangle loop inner bounds need to be {0,1}:{N,0}
1321*0fca6ea1SDimitry Andric   if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
1322*0fca6ea1SDimitry Andric       inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
1323*0fca6ea1SDimitry Andric     return nested_loop_type_upper_triangular_matrix;
1324*0fca6ea1SDimitry Andric   }
1325*0fca6ea1SDimitry Andric   return nested_loop_type_unkown;
1326*0fca6ea1SDimitry Andric }
1327*0fca6ea1SDimitry Andric 
1328*0fca6ea1SDimitry Andric /**************************************************************************
1329*0fca6ea1SDimitry Andric  * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
1330*0fca6ea1SDimitry Andric  * Start point is x so the result is always > sqrt(x)
1331*0fca6ea1SDimitry Andric  * The method has uniform convergence, PRECISION is set to 0.1
1332*0fca6ea1SDimitry Andric  * ************************************************************************/
1333*0fca6ea1SDimitry Andric #define level_of_precision 0.1
1334*0fca6ea1SDimitry Andric double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
1335*0fca6ea1SDimitry Andric   double sqrt_old = 0.;
1336*0fca6ea1SDimitry Andric   double sqrt_new = (double)x;
1337*0fca6ea1SDimitry Andric   do {
1338*0fca6ea1SDimitry Andric     sqrt_old = sqrt_new;
1339*0fca6ea1SDimitry Andric     sqrt_new = (sqrt_old + x / sqrt_old) / 2;
1340*0fca6ea1SDimitry Andric   } while ((sqrt_old - sqrt_new) > level_of_precision);
1341*0fca6ea1SDimitry Andric   return sqrt_new;
1342*0fca6ea1SDimitry Andric }
1343*0fca6ea1SDimitry Andric 
1344*0fca6ea1SDimitry Andric /**************************************************************************
1345*0fca6ea1SDimitry Andric  *  Handle lower triangle matrix in the canonical form
1346*0fca6ea1SDimitry Andric  *  i = 0; i <= N; i++          {0,0}:{N,0}
1347*0fca6ea1SDimitry Andric  *  j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
1348*0fca6ea1SDimitry Andric  * ************************************************************************/
1349*0fca6ea1SDimitry Andric void kmp_handle_lower_triangle_matrix(
1350*0fca6ea1SDimitry Andric     /*in*/ kmp_uint32 nth,
1351*0fca6ea1SDimitry Andric     /*in*/ kmp_uint32 tid,
1352*0fca6ea1SDimitry Andric     /*in */ kmp_index_t n,
1353*0fca6ea1SDimitry Andric     /*in/out*/ bounds_info_t *original_bounds_nest,
1354*0fca6ea1SDimitry Andric     /*out*/ bounds_info_t *chunk_bounds_nest) {
1355*0fca6ea1SDimitry Andric 
1356*0fca6ea1SDimitry Andric   // transfer loop types from the original loop to the chunks
1357*0fca6ea1SDimitry Andric   for (kmp_index_t i = 0; i < n; ++i) {
1358*0fca6ea1SDimitry Andric     chunk_bounds_nest[i] = original_bounds_nest[i];
1359*0fca6ea1SDimitry Andric   }
1360*0fca6ea1SDimitry Andric   // cleanup iv variables
1361*0fca6ea1SDimitry Andric   kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1362*0fca6ea1SDimitry Andric                                     original_bounds_nest[0].ub0_u64);
1363*0fca6ea1SDimitry Andric   kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1364*0fca6ea1SDimitry Andric                                     original_bounds_nest[0].lb0_u64);
1365*0fca6ea1SDimitry Andric   kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1366*0fca6ea1SDimitry Andric                                     original_bounds_nest[1].ub0_u64);
1367*0fca6ea1SDimitry Andric   // calculate the chunk's lower and upper bounds
1368*0fca6ea1SDimitry Andric   // the total number of iterations in the loop is the sum of the arithmetic
1369*0fca6ea1SDimitry Andric   // progression from the outer lower to outer upper bound (inclusive since the
1370*0fca6ea1SDimitry Andric   // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1371*0fca6ea1SDimitry Andric   // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1372*0fca6ea1SDimitry Andric   // + 1) -> N - 1
1373*0fca6ea1SDimitry Andric   kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
1374*0fca6ea1SDimitry Andric   kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
1375*0fca6ea1SDimitry Andric   // the current thread's number of iterations:
1376*0fca6ea1SDimitry Andric   // each thread gets an equal number of iterations: total number of iterations
1377*0fca6ea1SDimitry Andric   // divided by the number of threads plus, if there's a remainder,
1378*0fca6ea1SDimitry Andric   // the first threads with the number up to the remainder get an additional
1379*0fca6ea1SDimitry Andric   // iteration each to cover it
1380*0fca6ea1SDimitry Andric   kmp_uint64 iter_current =
1381*0fca6ea1SDimitry Andric       iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
1382*0fca6ea1SDimitry Andric   // cumulative number of iterations executed by all the previous threads:
1383*0fca6ea1SDimitry Andric   // threads with the tid below the remainder will have (iter_total/nth+1)
1384*0fca6ea1SDimitry Andric   // elements, and so will all threads before them so the cumulative number of
1385*0fca6ea1SDimitry Andric   // iterations executed by the all previous will be the current thread's number
1386*0fca6ea1SDimitry Andric   // of iterations multiplied by the number of previous threads which is equal
1387*0fca6ea1SDimitry Andric   // to the current thread's tid; threads with the number equal or above the
1388*0fca6ea1SDimitry Andric   // remainder will have (iter_total/nth) elements so the cumulative number of
1389*0fca6ea1SDimitry Andric   // iterations previously executed is its number of iterations multipled by the
1390*0fca6ea1SDimitry Andric   // number of previous threads which is again equal to the current thread's tid
1391*0fca6ea1SDimitry Andric   // PLUS all the remainder iterations that will have been executed by the
1392*0fca6ea1SDimitry Andric   // previous threads
1393*0fca6ea1SDimitry Andric   kmp_uint64 iter_before_current =
1394*0fca6ea1SDimitry Andric       tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
1395*0fca6ea1SDimitry Andric   // cumulative number of iterations executed with the current thread is
1396*0fca6ea1SDimitry Andric   // the cumulative number executed before it plus its own
1397*0fca6ea1SDimitry Andric   kmp_uint64 iter_with_current = iter_before_current + iter_current;
1398*0fca6ea1SDimitry Andric   // calculate the outer loop lower bound (lbo) which is the max outer iv value
1399*0fca6ea1SDimitry Andric   // that gives the number of iterations that is equal or just below the total
1400*0fca6ea1SDimitry Andric   // number of iterations executed by the previous threads, for less_than
1401*0fca6ea1SDimitry Andric   // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
1402*0fca6ea1SDimitry Andric   // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
1403*0fca6ea1SDimitry Andric   // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
1404*0fca6ea1SDimitry Andric   // i.e. lbo*(lbo+1)/2<=iter_before_current =>
1405*0fca6ea1SDimitry Andric   // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
1406*0fca6ea1SDimitry Andric   // using a parameter to control the equation sign
1407*0fca6ea1SDimitry Andric   kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
1408*0fca6ea1SDimitry Andric   kmp_uint64 lower_bound_outer =
1409*0fca6ea1SDimitry Andric       (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
1410*0fca6ea1SDimitry Andric                                       8 * iter_before_current) +
1411*0fca6ea1SDimitry Andric                    inner_adjustment) /
1412*0fca6ea1SDimitry Andric           2 -
1413*0fca6ea1SDimitry Andric       inner_adjustment;
1414*0fca6ea1SDimitry Andric   // calculate the inner loop lower bound which is the remaining number of
1415*0fca6ea1SDimitry Andric   // iterations required to hit the total number of iterations executed by the
1416*0fca6ea1SDimitry Andric   // previous threads giving the starting point of this thread
1417*0fca6ea1SDimitry Andric   kmp_uint64 lower_bound_inner =
1418*0fca6ea1SDimitry Andric       iter_before_current -
1419*0fca6ea1SDimitry Andric       ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
1420*0fca6ea1SDimitry Andric   // calculate the outer loop upper bound using the same approach as for the
1421*0fca6ea1SDimitry Andric   // inner bound except using the total number of iterations executed with the
1422*0fca6ea1SDimitry Andric   // current thread
1423*0fca6ea1SDimitry Andric   kmp_uint64 upper_bound_outer =
1424*0fca6ea1SDimitry Andric       (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
1425*0fca6ea1SDimitry Andric                                       8 * iter_with_current) +
1426*0fca6ea1SDimitry Andric                    inner_adjustment) /
1427*0fca6ea1SDimitry Andric           2 -
1428*0fca6ea1SDimitry Andric       inner_adjustment;
1429*0fca6ea1SDimitry Andric   // calculate the inner loop upper bound which is the remaining number of
1430*0fca6ea1SDimitry Andric   // iterations required to hit the total number of iterations executed after
1431*0fca6ea1SDimitry Andric   // the current thread giving the starting point of the next thread
1432*0fca6ea1SDimitry Andric   kmp_uint64 upper_bound_inner =
1433*0fca6ea1SDimitry Andric       iter_with_current -
1434*0fca6ea1SDimitry Andric       ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
1435*0fca6ea1SDimitry Andric   // adjust the upper bounds down by 1 element to point at the last iteration of
1436*0fca6ea1SDimitry Andric   // the current thread the first iteration of the next thread
1437*0fca6ea1SDimitry Andric   if (upper_bound_inner == 0) {
1438*0fca6ea1SDimitry Andric     // {n,0} => {n-1,n-1}
1439*0fca6ea1SDimitry Andric     upper_bound_outer -= 1;
1440*0fca6ea1SDimitry Andric     upper_bound_inner = upper_bound_outer;
1441*0fca6ea1SDimitry Andric   } else {
1442*0fca6ea1SDimitry Andric     // {n,m} => {n,m-1} (m!=0)
1443*0fca6ea1SDimitry Andric     upper_bound_inner -= 1;
1444*0fca6ea1SDimitry Andric   }
1445*0fca6ea1SDimitry Andric 
1446*0fca6ea1SDimitry Andric   // assign the values, zeroing out lb1 and ub1 values since the iteration space
1447*0fca6ea1SDimitry Andric   // is now one-dimensional
1448*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
1449*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
1450*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
1451*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
1452*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].lb1_u64 = 0;
1453*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].ub1_u64 = 0;
1454*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].lb1_u64 = 0;
1455*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].ub1_u64 = 0;
1456*0fca6ea1SDimitry Andric 
1457*0fca6ea1SDimitry Andric #if 0
1458*0fca6ea1SDimitry Andric   printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1459*0fca6ea1SDimitry Andric          tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
1460*0fca6ea1SDimitry Andric          chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
1461*0fca6ea1SDimitry Andric #endif
1462*0fca6ea1SDimitry Andric }
1463*0fca6ea1SDimitry Andric 
1464*0fca6ea1SDimitry Andric /**************************************************************************
1465*0fca6ea1SDimitry Andric  *  Handle upper triangle matrix in the canonical form
1466*0fca6ea1SDimitry Andric  *  i = 0; i <= N; i++     {0,0}:{N,0}
1467*0fca6ea1SDimitry Andric  *  j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1468*0fca6ea1SDimitry Andric  * ************************************************************************/
1469*0fca6ea1SDimitry Andric void kmp_handle_upper_triangle_matrix(
1470*0fca6ea1SDimitry Andric     /*in*/ kmp_uint32 nth,
1471*0fca6ea1SDimitry Andric     /*in*/ kmp_uint32 tid,
1472*0fca6ea1SDimitry Andric     /*in */ kmp_index_t n,
1473*0fca6ea1SDimitry Andric     /*in/out*/ bounds_info_t *original_bounds_nest,
1474*0fca6ea1SDimitry Andric     /*out*/ bounds_info_t *chunk_bounds_nest) {
1475*0fca6ea1SDimitry Andric 
1476*0fca6ea1SDimitry Andric   // transfer loop types from the original loop to the chunks
1477*0fca6ea1SDimitry Andric   for (kmp_index_t i = 0; i < n; ++i) {
1478*0fca6ea1SDimitry Andric     chunk_bounds_nest[i] = original_bounds_nest[i];
1479*0fca6ea1SDimitry Andric   }
1480*0fca6ea1SDimitry Andric   // cleanup iv variables
1481*0fca6ea1SDimitry Andric   kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1482*0fca6ea1SDimitry Andric                                     original_bounds_nest[0].ub0_u64);
1483*0fca6ea1SDimitry Andric   kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1484*0fca6ea1SDimitry Andric                                     original_bounds_nest[0].lb0_u64);
1485*0fca6ea1SDimitry Andric   [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
1486*0fca6ea1SDimitry Andric       original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
1487*0fca6ea1SDimitry Andric   // calculate the chunk's lower and upper bounds
1488*0fca6ea1SDimitry Andric   // the total number of iterations in the loop is the sum of the arithmetic
1489*0fca6ea1SDimitry Andric   // progression from the outer lower to outer upper bound (inclusive since the
1490*0fca6ea1SDimitry Andric   // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1491*0fca6ea1SDimitry Andric   // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1492*0fca6ea1SDimitry Andric   // + 1) -> N - 1
1493*0fca6ea1SDimitry Andric   kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
1494*0fca6ea1SDimitry Andric   kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
1495*0fca6ea1SDimitry Andric   // the current thread's number of iterations:
1496*0fca6ea1SDimitry Andric   // each thread gets an equal number of iterations: total number of iterations
1497*0fca6ea1SDimitry Andric   // divided by the number of threads plus, if there's a remainder,
1498*0fca6ea1SDimitry Andric   // the first threads with the number up to the remainder get an additional
1499*0fca6ea1SDimitry Andric   // iteration each to cover it
1500*0fca6ea1SDimitry Andric   kmp_uint64 iter_current =
1501*0fca6ea1SDimitry Andric       iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
1502*0fca6ea1SDimitry Andric   // cumulative number of iterations executed by all the previous threads:
1503*0fca6ea1SDimitry Andric   // threads with the tid below the remainder will have (iter_total/nth+1)
1504*0fca6ea1SDimitry Andric   // elements, and so will all threads before them so the cumulative number of
1505*0fca6ea1SDimitry Andric   // iterations executed by the all previous will be the current thread's number
1506*0fca6ea1SDimitry Andric   // of iterations multiplied by the number of previous threads which is equal
1507*0fca6ea1SDimitry Andric   // to the current thread's tid; threads with the number equal or above the
1508*0fca6ea1SDimitry Andric   // remainder will have (iter_total/nth) elements so the cumulative number of
1509*0fca6ea1SDimitry Andric   // iterations previously executed is its number of iterations multipled by the
1510*0fca6ea1SDimitry Andric   // number of previous threads which is again equal to the current thread's tid
1511*0fca6ea1SDimitry Andric   // PLUS all the remainder iterations that will have been executed by the
1512*0fca6ea1SDimitry Andric   // previous threads
1513*0fca6ea1SDimitry Andric   kmp_uint64 iter_before_current =
1514*0fca6ea1SDimitry Andric       tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
1515*0fca6ea1SDimitry Andric   // cumulative number of iterations executed with the current thread is
1516*0fca6ea1SDimitry Andric   // the cumulative number executed before it plus its own
1517*0fca6ea1SDimitry Andric   kmp_uint64 iter_with_current = iter_before_current + iter_current;
1518*0fca6ea1SDimitry Andric   // calculate the outer loop lower bound (lbo) which is the max outer iv value
1519*0fca6ea1SDimitry Andric   // that gives the number of iterations that is equal or just below the total
1520*0fca6ea1SDimitry Andric   // number of iterations executed by the previous threads:
1521*0fca6ea1SDimitry Andric   // lbo*(lbo+1)/2<=iter_before_current =>
1522*0fca6ea1SDimitry Andric   // lbo^2+lbo-2*iter_before_current<=0
1523*0fca6ea1SDimitry Andric   kmp_uint64 lower_bound_outer =
1524*0fca6ea1SDimitry Andric       (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
1525*0fca6ea1SDimitry Andric   // calculate the inner loop lower bound which is the remaining number of
1526*0fca6ea1SDimitry Andric   // iterations required to hit the total number of iterations executed by the
1527*0fca6ea1SDimitry Andric   // previous threads giving the starting point of this thread
1528*0fca6ea1SDimitry Andric   kmp_uint64 lower_bound_inner =
1529*0fca6ea1SDimitry Andric       iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
1530*0fca6ea1SDimitry Andric   // calculate the outer loop upper bound using the same approach as for the
1531*0fca6ea1SDimitry Andric   // inner bound except using the total number of iterations executed with the
1532*0fca6ea1SDimitry Andric   // current thread
1533*0fca6ea1SDimitry Andric   kmp_uint64 upper_bound_outer =
1534*0fca6ea1SDimitry Andric       (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
1535*0fca6ea1SDimitry Andric   // calculate the inner loop upper bound which is the remaining number of
1536*0fca6ea1SDimitry Andric   // iterations required to hit the total number of iterations executed after
1537*0fca6ea1SDimitry Andric   // the current thread giving the starting point of the next thread
1538*0fca6ea1SDimitry Andric   kmp_uint64 upper_bound_inner =
1539*0fca6ea1SDimitry Andric       iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
1540*0fca6ea1SDimitry Andric   // adjust the upper bounds down by 1 element to point at the last iteration of
1541*0fca6ea1SDimitry Andric   // the current thread the first iteration of the next thread
1542*0fca6ea1SDimitry Andric   if (upper_bound_inner == 0) {
1543*0fca6ea1SDimitry Andric     // {n,0} => {n-1,n-1}
1544*0fca6ea1SDimitry Andric     upper_bound_outer -= 1;
1545*0fca6ea1SDimitry Andric     upper_bound_inner = upper_bound_outer;
1546*0fca6ea1SDimitry Andric   } else {
1547*0fca6ea1SDimitry Andric     // {n,m} => {n,m-1} (m!=0)
1548*0fca6ea1SDimitry Andric     upper_bound_inner -= 1;
1549*0fca6ea1SDimitry Andric   }
1550*0fca6ea1SDimitry Andric 
1551*0fca6ea1SDimitry Andric   // assign the values, zeroing out lb1 and ub1 values since the iteration space
1552*0fca6ea1SDimitry Andric   // is now one-dimensional
1553*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
1554*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
1555*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
1556*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
1557*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].lb1_u64 = 0;
1558*0fca6ea1SDimitry Andric   chunk_bounds_nest[0].ub1_u64 = 0;
1559*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].lb1_u64 = 0;
1560*0fca6ea1SDimitry Andric   chunk_bounds_nest[1].ub1_u64 = 0;
1561*0fca6ea1SDimitry Andric 
1562*0fca6ea1SDimitry Andric #if 0
1563*0fca6ea1SDimitry Andric   printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1564*0fca6ea1SDimitry Andric          tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
1565*0fca6ea1SDimitry Andric          chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
1566*0fca6ea1SDimitry Andric #endif
1567*0fca6ea1SDimitry Andric }
156806c3fb27SDimitry Andric //----------Init API for non-rectangular loops--------------------------------
156906c3fb27SDimitry Andric 
157006c3fb27SDimitry Andric // Init API for collapsed loops (static, no chunks defined).
157106c3fb27SDimitry Andric // "bounds_nest" has to be allocated per thread.
157206c3fb27SDimitry Andric // API will modify original bounds_nest array to bring it to a canonical form
157306c3fb27SDimitry Andric // (only <= and >=, no !=, <, >). If the original loop nest was already in a
157406c3fb27SDimitry Andric // canonical form there will be no changes to bounds in bounds_nest array
157506c3fb27SDimitry Andric // (only trip counts will be calculated). Internally API will expand the space
157606c3fb27SDimitry Andric // to parallelogram/parallelepiped, calculate total, calculate bounds for the
157706c3fb27SDimitry Andric // chunks in terms of the new IV, re-calc them in terms of old IVs (especially
157806c3fb27SDimitry Andric // important on the left side, to hit the lower bounds and not step over), and
157906c3fb27SDimitry Andric // pick the correct chunk for this thread (so it will calculate chunks up to the
158006c3fb27SDimitry Andric // needed one). It could be optimized to calculate just this chunk, potentially
158106c3fb27SDimitry Andric // a bit less well distributed among threads. It is designed to make sure that
158206c3fb27SDimitry Andric // threads will receive predictable chunks, deterministically (so that next nest
158306c3fb27SDimitry Andric // of loops with similar characteristics will get exactly same chunks on same
158406c3fb27SDimitry Andric // threads).
158506c3fb27SDimitry Andric // Current contract: chunk_bounds_nest has only lb0 and ub0,
158606c3fb27SDimitry Andric // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
158706c3fb27SDimitry Andric extern "C" kmp_int32
158806c3fb27SDimitry Andric __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
158906c3fb27SDimitry Andric                           /*in/out*/ bounds_info_t *original_bounds_nest,
159006c3fb27SDimitry Andric                           /*out*/ bounds_info_t *chunk_bounds_nest,
159106c3fb27SDimitry Andric                           kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
159206c3fb27SDimitry Andric 
159306c3fb27SDimitry Andric   KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
159406c3fb27SDimitry Andric   KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
159506c3fb27SDimitry Andric 
159606c3fb27SDimitry Andric   if (__kmp_env_consistency_check) {
159706c3fb27SDimitry Andric     __kmp_push_workshare(gtid, ct_pdo, loc);
159806c3fb27SDimitry Andric   }
159906c3fb27SDimitry Andric 
160006c3fb27SDimitry Andric   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
160106c3fb27SDimitry Andric 
16025f757f3fSDimitry Andric   CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
160306c3fb27SDimitry Andric 
160406c3fb27SDimitry Andric   for (kmp_index_t i = 0; i < n; ++i) {
160506c3fb27SDimitry Andric     updated_bounds_nest[i].b = original_bounds_nest[i];
160606c3fb27SDimitry Andric   }
160706c3fb27SDimitry Andric 
160806c3fb27SDimitry Andric   kmp_loop_nest_iv_t total =
160906c3fb27SDimitry Andric       kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
161006c3fb27SDimitry Andric 
161106c3fb27SDimitry Andric   if (plastiter != NULL) {
161206c3fb27SDimitry Andric     *plastiter = FALSE;
161306c3fb27SDimitry Andric   }
161406c3fb27SDimitry Andric 
161506c3fb27SDimitry Andric   if (total == 0) {
161606c3fb27SDimitry Andric     // Loop won't execute:
161706c3fb27SDimitry Andric     return FALSE;
161806c3fb27SDimitry Andric   }
161906c3fb27SDimitry Andric 
162006c3fb27SDimitry Andric   // OMPTODO: DISTRIBUTE is not supported yet
162106c3fb27SDimitry Andric   __kmp_assert_valid_gtid(gtid);
162206c3fb27SDimitry Andric   kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
162306c3fb27SDimitry Andric 
162406c3fb27SDimitry Andric   kmp_info_t *th = __kmp_threads[gtid];
162506c3fb27SDimitry Andric   kmp_team_t *team = th->th.th_team;
162606c3fb27SDimitry Andric   kmp_uint32 nth = team->t.t_nproc; // Number of threads
162706c3fb27SDimitry Andric 
162806c3fb27SDimitry Andric   KMP_DEBUG_ASSERT(tid < nth);
162906c3fb27SDimitry Andric 
1630*0fca6ea1SDimitry Andric   // Handle special cases
1631*0fca6ea1SDimitry Andric   nested_loop_type_t loop_type =
1632*0fca6ea1SDimitry Andric       kmp_identify_nested_loop_structure(original_bounds_nest, n);
1633*0fca6ea1SDimitry Andric   if (loop_type == nested_loop_type_lower_triangular_matrix) {
1634*0fca6ea1SDimitry Andric     kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
1635*0fca6ea1SDimitry Andric                                      chunk_bounds_nest);
1636*0fca6ea1SDimitry Andric     return TRUE;
1637*0fca6ea1SDimitry Andric   } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
1638*0fca6ea1SDimitry Andric     kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
1639*0fca6ea1SDimitry Andric                                      chunk_bounds_nest);
1640*0fca6ea1SDimitry Andric     return TRUE;
1641*0fca6ea1SDimitry Andric   }
1642*0fca6ea1SDimitry Andric 
16435f757f3fSDimitry Andric   CollapseAllocator<kmp_uint64> original_ivs_start(n);
164406c3fb27SDimitry Andric 
164506c3fb27SDimitry Andric   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
164606c3fb27SDimitry Andric                                        /*out*/ original_ivs_start)) {
164706c3fb27SDimitry Andric     // Loop won't execute:
164806c3fb27SDimitry Andric     return FALSE;
164906c3fb27SDimitry Andric   }
165006c3fb27SDimitry Andric 
165106c3fb27SDimitry Andric   // Not doing this optimization for one thread:
165206c3fb27SDimitry Andric   // (1) more to test
165306c3fb27SDimitry Andric   // (2) without it current contract that chunk_bounds_nest has only lb0 and
165406c3fb27SDimitry Andric   // ub0, lb1 and ub1 are set to 0 and can be ignored.
165506c3fb27SDimitry Andric   // if (nth == 1) {
165606c3fb27SDimitry Andric   //  // One thread:
165706c3fb27SDimitry Andric   //  // Copy all info from original_bounds_nest, it'll be good enough.
165806c3fb27SDimitry Andric 
165906c3fb27SDimitry Andric   //  for (kmp_index_t i = 0; i < n; ++i) {
166006c3fb27SDimitry Andric   //    chunk_bounds_nest[i] = original_bounds_nest[i];
166106c3fb27SDimitry Andric   //  }
166206c3fb27SDimitry Andric 
166306c3fb27SDimitry Andric   //  if (plastiter != NULL) {
166406c3fb27SDimitry Andric   //    *plastiter = TRUE;
166506c3fb27SDimitry Andric   //  }
166606c3fb27SDimitry Andric   //  return TRUE;
166706c3fb27SDimitry Andric   //}
166806c3fb27SDimitry Andric 
166906c3fb27SDimitry Andric   kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
167006c3fb27SDimitry Andric       updated_bounds_nest, original_ivs_start, n);
167106c3fb27SDimitry Andric 
167206c3fb27SDimitry Andric   bool last_iter = false;
167306c3fb27SDimitry Andric 
167406c3fb27SDimitry Andric   for (; nth > 0;) {
167506c3fb27SDimitry Andric     // We could calculate chunk size once, but this is to compensate that the
167606c3fb27SDimitry Andric     // original space is not parallelepiped and some threads can be left
167706c3fb27SDimitry Andric     // without work:
167806c3fb27SDimitry Andric     KMP_DEBUG_ASSERT(total >= new_iv);
167906c3fb27SDimitry Andric 
168006c3fb27SDimitry Andric     kmp_loop_nest_iv_t total_left = total - new_iv;
168106c3fb27SDimitry Andric     kmp_loop_nest_iv_t chunk_size = total_left / nth;
168206c3fb27SDimitry Andric     kmp_loop_nest_iv_t remainder = total_left % nth;
168306c3fb27SDimitry Andric 
168406c3fb27SDimitry Andric     kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
168506c3fb27SDimitry Andric 
168606c3fb27SDimitry Andric     if (remainder > 0) {
168706c3fb27SDimitry Andric       ++curr_chunk_size;
168806c3fb27SDimitry Andric       --remainder;
168906c3fb27SDimitry Andric     }
169006c3fb27SDimitry Andric 
169106c3fb27SDimitry Andric #if defined(KMP_DEBUG)
169206c3fb27SDimitry Andric     kmp_loop_nest_iv_t new_iv_for_start = new_iv;
169306c3fb27SDimitry Andric #endif
169406c3fb27SDimitry Andric 
169506c3fb27SDimitry Andric     if (curr_chunk_size > 1) {
169606c3fb27SDimitry Andric       new_iv += curr_chunk_size - 1;
169706c3fb27SDimitry Andric     }
169806c3fb27SDimitry Andric 
16995f757f3fSDimitry Andric     CollapseAllocator<kmp_uint64> original_ivs_end(n);
170006c3fb27SDimitry Andric     if ((nth == 1) || (new_iv >= total - 1)) {
170106c3fb27SDimitry Andric       // Do this one till the end - just in case we miscalculated
170206c3fb27SDimitry Andric       // and either too much is left to process or new_iv is a bit too big:
170306c3fb27SDimitry Andric       kmp_calc_original_ivs_for_end(original_bounds_nest, n,
170406c3fb27SDimitry Andric                                     /*out*/ original_ivs_end);
170506c3fb27SDimitry Andric 
170606c3fb27SDimitry Andric       last_iter = true;
170706c3fb27SDimitry Andric     } else {
170806c3fb27SDimitry Andric       // Note: here we make sure it's past (or equal to) the previous point.
170906c3fb27SDimitry Andric       if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
171006c3fb27SDimitry Andric                                                updated_bounds_nest,
171106c3fb27SDimitry Andric                                                original_ivs_start, new_iv,
171206c3fb27SDimitry Andric                                                /*out*/ original_ivs_end)) {
171306c3fb27SDimitry Andric         // We could not find the ending point, use the original upper bounds:
171406c3fb27SDimitry Andric         kmp_calc_original_ivs_for_end(original_bounds_nest, n,
171506c3fb27SDimitry Andric                                       /*out*/ original_ivs_end);
171606c3fb27SDimitry Andric 
171706c3fb27SDimitry Andric         last_iter = true;
171806c3fb27SDimitry Andric       }
171906c3fb27SDimitry Andric     }
172006c3fb27SDimitry Andric 
172106c3fb27SDimitry Andric #if defined(KMP_DEBUG)
172206c3fb27SDimitry Andric     auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
172306c3fb27SDimitry Andric         updated_bounds_nest, original_ivs_end, n);
172406c3fb27SDimitry Andric     KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
172506c3fb27SDimitry Andric #endif
172606c3fb27SDimitry Andric 
172706c3fb27SDimitry Andric     if (last_iter && (tid != 0)) {
172806c3fb27SDimitry Andric       // We are done, this was last chunk, but no chunk for current thread was
172906c3fb27SDimitry Andric       // found:
173006c3fb27SDimitry Andric       return FALSE;
173106c3fb27SDimitry Andric     }
173206c3fb27SDimitry Andric 
173306c3fb27SDimitry Andric     if (tid == 0) {
173406c3fb27SDimitry Andric       // We found the chunk for this thread, now we need to check if it's the
173506c3fb27SDimitry Andric       // last chunk or not:
173606c3fb27SDimitry Andric 
17375f757f3fSDimitry Andric       CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
173806c3fb27SDimitry Andric       if (last_iter ||
173906c3fb27SDimitry Andric           !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
174006c3fb27SDimitry Andric                                       /*out*/ original_ivs_next_start)) {
174106c3fb27SDimitry Andric         // no more loop iterations left to process,
174206c3fb27SDimitry Andric         // this means that currently found chunk is the last chunk:
174306c3fb27SDimitry Andric         if (plastiter != NULL) {
174406c3fb27SDimitry Andric           *plastiter = TRUE;
174506c3fb27SDimitry Andric         }
174606c3fb27SDimitry Andric       }
174706c3fb27SDimitry Andric 
174806c3fb27SDimitry Andric       // Fill in chunk bounds:
174906c3fb27SDimitry Andric       for (kmp_index_t i = 0; i < n; ++i) {
175006c3fb27SDimitry Andric         chunk_bounds_nest[i] =
175106c3fb27SDimitry Andric             original_bounds_nest[i]; // To fill in types, etc. - optional
175206c3fb27SDimitry Andric         chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
175306c3fb27SDimitry Andric         chunk_bounds_nest[i].lb1_u64 = 0;
175406c3fb27SDimitry Andric 
175506c3fb27SDimitry Andric         chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
175606c3fb27SDimitry Andric         chunk_bounds_nest[i].ub1_u64 = 0;
175706c3fb27SDimitry Andric       }
175806c3fb27SDimitry Andric 
175906c3fb27SDimitry Andric       return TRUE;
176006c3fb27SDimitry Andric     }
176106c3fb27SDimitry Andric 
176206c3fb27SDimitry Andric     --tid;
176306c3fb27SDimitry Andric     --nth;
176406c3fb27SDimitry Andric 
176506c3fb27SDimitry Andric     bool next_chunk = kmp_calc_next_original_ivs(
176606c3fb27SDimitry Andric         original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
176706c3fb27SDimitry Andric     if (!next_chunk) {
176806c3fb27SDimitry Andric       // no more loop iterations to process,
176906c3fb27SDimitry Andric       // the prevoius chunk was the last chunk
177006c3fb27SDimitry Andric       break;
177106c3fb27SDimitry Andric     }
177206c3fb27SDimitry Andric 
177306c3fb27SDimitry Andric     // original_ivs_start is next to previous chunk original_ivs_end,
177406c3fb27SDimitry Andric     // we need to start new chunk here, so chunks will be one after another
177506c3fb27SDimitry Andric     // without any gap or overlap:
177606c3fb27SDimitry Andric     new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
177706c3fb27SDimitry Andric                                                original_ivs_start, n);
177806c3fb27SDimitry Andric   }
177906c3fb27SDimitry Andric 
178006c3fb27SDimitry Andric   return FALSE;
178106c3fb27SDimitry Andric }
1782