106c3fb27SDimitry Andric /* 206c3fb27SDimitry Andric * kmp_collapse.cpp -- loop collapse feature 306c3fb27SDimitry Andric */ 406c3fb27SDimitry Andric 506c3fb27SDimitry Andric //===----------------------------------------------------------------------===// 606c3fb27SDimitry Andric // 706c3fb27SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 806c3fb27SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 906c3fb27SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 1006c3fb27SDimitry Andric // 1106c3fb27SDimitry Andric //===----------------------------------------------------------------------===// 1206c3fb27SDimitry Andric 1306c3fb27SDimitry Andric #include "kmp.h" 1406c3fb27SDimitry Andric #include "kmp_error.h" 1506c3fb27SDimitry Andric #include "kmp_i18n.h" 1606c3fb27SDimitry Andric #include "kmp_itt.h" 1706c3fb27SDimitry Andric #include "kmp_stats.h" 1806c3fb27SDimitry Andric #include "kmp_str.h" 1906c3fb27SDimitry Andric #include "kmp_collapse.h" 2006c3fb27SDimitry Andric 2106c3fb27SDimitry Andric #if OMPT_SUPPORT 2206c3fb27SDimitry Andric #include "ompt-specific.h" 2306c3fb27SDimitry Andric #endif 2406c3fb27SDimitry Andric 2506c3fb27SDimitry Andric // OMPTODO: different style of comments (see kmp_sched) 2606c3fb27SDimitry Andric // OMPTODO: OMPT/OMPD 2706c3fb27SDimitry Andric 2806c3fb27SDimitry Andric // avoid inadevertently using a library based abs 2906c3fb27SDimitry Andric template <typename T> T __kmp_abs(const T val) { 3006c3fb27SDimitry Andric return (val < 0) ? -val : val; 3106c3fb27SDimitry Andric } 3206c3fb27SDimitry Andric kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; } 3306c3fb27SDimitry Andric kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; } 3406c3fb27SDimitry Andric 3506c3fb27SDimitry Andric //---------------------------------------------------------------------------- 3606c3fb27SDimitry Andric // Common functions for working with rectangular and non-rectangular loops 3706c3fb27SDimitry Andric //---------------------------------------------------------------------------- 3806c3fb27SDimitry Andric 395f757f3fSDimitry Andric template <typename T> int __kmp_sign(T val) { 405f757f3fSDimitry Andric return (T(0) < val) - (val < T(0)); 415f757f3fSDimitry Andric } 425f757f3fSDimitry Andric 435f757f3fSDimitry Andric template <typename T> class CollapseAllocator { 445f757f3fSDimitry Andric typedef T *pT; 455f757f3fSDimitry Andric 465f757f3fSDimitry Andric private: 475f757f3fSDimitry Andric static const size_t allocaSize = 32; // size limit for stack allocations 485f757f3fSDimitry Andric // (8 bytes x 4 nested loops) 495f757f3fSDimitry Andric char stackAlloc[allocaSize]; 505f757f3fSDimitry Andric static constexpr size_t maxElemCount = allocaSize / sizeof(T); 515f757f3fSDimitry Andric pT pTAlloc; 525f757f3fSDimitry Andric 535f757f3fSDimitry Andric public: 545f757f3fSDimitry Andric CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) { 555f757f3fSDimitry Andric if (n > maxElemCount) { 565f757f3fSDimitry Andric pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T))); 575f757f3fSDimitry Andric } 585f757f3fSDimitry Andric } 595f757f3fSDimitry Andric ~CollapseAllocator() { 605f757f3fSDimitry Andric if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) { 615f757f3fSDimitry Andric __kmp_free(pTAlloc); 625f757f3fSDimitry Andric } 635f757f3fSDimitry Andric } 645f757f3fSDimitry Andric T &operator[](int index) { return pTAlloc[index]; } 655f757f3fSDimitry Andric operator const pT() { return pTAlloc; } 665f757f3fSDimitry Andric }; 6706c3fb27SDimitry Andric 6806c3fb27SDimitry Andric //----------Loop canonicalization--------------------------------------------- 6906c3fb27SDimitry Andric 7006c3fb27SDimitry Andric // For loop nest (any shape): 7106c3fb27SDimitry Andric // convert != to < or >; 7206c3fb27SDimitry Andric // switch from using < or > to <= or >=. 7306c3fb27SDimitry Andric // "bounds" array has to be allocated per thread. 7406c3fb27SDimitry Andric // All other internal functions will work only with canonicalized loops. 7506c3fb27SDimitry Andric template <typename T> 7606c3fb27SDimitry Andric void kmp_canonicalize_one_loop_XX( 7706c3fb27SDimitry Andric ident_t *loc, 7806c3fb27SDimitry Andric /*in/out*/ bounds_infoXX_template<T> *bounds) { 7906c3fb27SDimitry Andric 8006c3fb27SDimitry Andric if (__kmp_env_consistency_check) { 8106c3fb27SDimitry Andric if (bounds->step == 0) { 8206c3fb27SDimitry Andric __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, 8306c3fb27SDimitry Andric loc); 8406c3fb27SDimitry Andric } 8506c3fb27SDimitry Andric } 8606c3fb27SDimitry Andric 8706c3fb27SDimitry Andric if (bounds->comparison == comparison_t::comp_not_eq) { 8806c3fb27SDimitry Andric // We can convert this to < or >, depends on the sign of the step: 8906c3fb27SDimitry Andric if (bounds->step > 0) { 9006c3fb27SDimitry Andric bounds->comparison = comparison_t::comp_less; 9106c3fb27SDimitry Andric } else { 9206c3fb27SDimitry Andric bounds->comparison = comparison_t::comp_greater; 9306c3fb27SDimitry Andric } 9406c3fb27SDimitry Andric } 9506c3fb27SDimitry Andric 9606c3fb27SDimitry Andric if (bounds->comparison == comparison_t::comp_less) { 9706c3fb27SDimitry Andric // Note: ub0 can be unsigned. Should be Ok to hit overflow here, 9806c3fb27SDimitry Andric // because ub0 + ub1*j should be still positive (otherwise loop was not 9906c3fb27SDimitry Andric // well formed) 10006c3fb27SDimitry Andric bounds->ub0 -= 1; 10106c3fb27SDimitry Andric bounds->comparison = comparison_t::comp_less_or_eq; 10206c3fb27SDimitry Andric } else if (bounds->comparison == comparison_t::comp_greater) { 10306c3fb27SDimitry Andric bounds->ub0 += 1; 10406c3fb27SDimitry Andric bounds->comparison = comparison_t::comp_greater_or_eq; 10506c3fb27SDimitry Andric } 10606c3fb27SDimitry Andric } 10706c3fb27SDimitry Andric 10806c3fb27SDimitry Andric // Canonicalize loop nest. original_bounds_nest is an array of length n. 10906c3fb27SDimitry Andric void kmp_canonicalize_loop_nest(ident_t *loc, 11006c3fb27SDimitry Andric /*in/out*/ bounds_info_t *original_bounds_nest, 11106c3fb27SDimitry Andric kmp_index_t n) { 11206c3fb27SDimitry Andric 11306c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 11406c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 11506c3fb27SDimitry Andric 11606c3fb27SDimitry Andric switch (bounds->loop_type) { 11706c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 11806c3fb27SDimitry Andric kmp_canonicalize_one_loop_XX<kmp_int32>( 11906c3fb27SDimitry Andric loc, 12006c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds)); 12106c3fb27SDimitry Andric break; 12206c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 12306c3fb27SDimitry Andric kmp_canonicalize_one_loop_XX<kmp_uint32>( 12406c3fb27SDimitry Andric loc, 12506c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds)); 12606c3fb27SDimitry Andric break; 12706c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 12806c3fb27SDimitry Andric kmp_canonicalize_one_loop_XX<kmp_int64>( 12906c3fb27SDimitry Andric loc, 13006c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds)); 13106c3fb27SDimitry Andric break; 13206c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 13306c3fb27SDimitry Andric kmp_canonicalize_one_loop_XX<kmp_uint64>( 13406c3fb27SDimitry Andric loc, 13506c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds)); 13606c3fb27SDimitry Andric break; 13706c3fb27SDimitry Andric default: 13806c3fb27SDimitry Andric KMP_ASSERT(false); 13906c3fb27SDimitry Andric } 14006c3fb27SDimitry Andric } 14106c3fb27SDimitry Andric } 14206c3fb27SDimitry Andric 14306c3fb27SDimitry Andric //----------Calculating trip count on one level------------------------------- 14406c3fb27SDimitry Andric 14506c3fb27SDimitry Andric // Calculate trip count on this loop level. 14606c3fb27SDimitry Andric // We do this either for a rectangular loop nest, 14706c3fb27SDimitry Andric // or after an adjustment bringing the loops to a parallelepiped shape. 14806c3fb27SDimitry Andric // This number should not depend on the value of outer IV 14906c3fb27SDimitry Andric // even if the formular has lb1 and ub1. 15006c3fb27SDimitry Andric // Note: for non-rectangular loops don't use span for this, it's too big. 15106c3fb27SDimitry Andric 15206c3fb27SDimitry Andric template <typename T> 15306c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_calculate_trip_count_XX( 15406c3fb27SDimitry Andric /*in/out*/ bounds_infoXX_template<T> *bounds) { 15506c3fb27SDimitry Andric 15606c3fb27SDimitry Andric if (bounds->comparison == comparison_t::comp_less_or_eq) { 15706c3fb27SDimitry Andric if (bounds->ub0 < bounds->lb0) { 15806c3fb27SDimitry Andric // Note: after this we don't need to calculate inner loops, 15906c3fb27SDimitry Andric // but that should be an edge case: 16006c3fb27SDimitry Andric bounds->trip_count = 0; 16106c3fb27SDimitry Andric } else { 16206c3fb27SDimitry Andric // ub - lb may exceed signed type range; we need to cast to 16306c3fb27SDimitry Andric // kmp_loop_nest_iv_t anyway 16406c3fb27SDimitry Andric bounds->trip_count = 16506c3fb27SDimitry Andric static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) / 16606c3fb27SDimitry Andric __kmp_abs(bounds->step) + 16706c3fb27SDimitry Andric 1; 16806c3fb27SDimitry Andric } 16906c3fb27SDimitry Andric } else if (bounds->comparison == comparison_t::comp_greater_or_eq) { 17006c3fb27SDimitry Andric if (bounds->lb0 < bounds->ub0) { 17106c3fb27SDimitry Andric // Note: after this we don't need to calculate inner loops, 17206c3fb27SDimitry Andric // but that should be an edge case: 17306c3fb27SDimitry Andric bounds->trip_count = 0; 17406c3fb27SDimitry Andric } else { 17506c3fb27SDimitry Andric // lb - ub may exceed signed type range; we need to cast to 17606c3fb27SDimitry Andric // kmp_loop_nest_iv_t anyway 17706c3fb27SDimitry Andric bounds->trip_count = 17806c3fb27SDimitry Andric static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) / 17906c3fb27SDimitry Andric __kmp_abs(bounds->step) + 18006c3fb27SDimitry Andric 1; 18106c3fb27SDimitry Andric } 18206c3fb27SDimitry Andric } else { 18306c3fb27SDimitry Andric KMP_ASSERT(false); 18406c3fb27SDimitry Andric } 18506c3fb27SDimitry Andric return bounds->trip_count; 18606c3fb27SDimitry Andric } 18706c3fb27SDimitry Andric 18806c3fb27SDimitry Andric // Calculate trip count on this loop level. 18906c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) { 19006c3fb27SDimitry Andric 19106c3fb27SDimitry Andric kmp_loop_nest_iv_t trip_count = 0; 19206c3fb27SDimitry Andric 19306c3fb27SDimitry Andric switch (bounds->loop_type) { 19406c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 19506c3fb27SDimitry Andric trip_count = kmp_calculate_trip_count_XX<kmp_int32>( 19606c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds)); 19706c3fb27SDimitry Andric break; 19806c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 19906c3fb27SDimitry Andric trip_count = kmp_calculate_trip_count_XX<kmp_uint32>( 20006c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds)); 20106c3fb27SDimitry Andric break; 20206c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 20306c3fb27SDimitry Andric trip_count = kmp_calculate_trip_count_XX<kmp_int64>( 20406c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds)); 20506c3fb27SDimitry Andric break; 20606c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 20706c3fb27SDimitry Andric trip_count = kmp_calculate_trip_count_XX<kmp_uint64>( 20806c3fb27SDimitry Andric /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds)); 20906c3fb27SDimitry Andric break; 21006c3fb27SDimitry Andric default: 21106c3fb27SDimitry Andric KMP_ASSERT(false); 21206c3fb27SDimitry Andric } 21306c3fb27SDimitry Andric 21406c3fb27SDimitry Andric return trip_count; 21506c3fb27SDimitry Andric } 21606c3fb27SDimitry Andric 21706c3fb27SDimitry Andric //----------Trim original iv according to its type---------------------------- 21806c3fb27SDimitry Andric 21906c3fb27SDimitry Andric // Trim original iv according to its type. 22006c3fb27SDimitry Andric // Return kmp_uint64 value which can be easily used in all internal calculations 22106c3fb27SDimitry Andric // And can be statically cast back to original type in user code. 22206c3fb27SDimitry Andric kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) { 22306c3fb27SDimitry Andric kmp_uint64 res = 0; 22406c3fb27SDimitry Andric 22506c3fb27SDimitry Andric switch (loop_iv_type) { 22606c3fb27SDimitry Andric case loop_type_t::loop_type_int8: 22706c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv)); 22806c3fb27SDimitry Andric break; 22906c3fb27SDimitry Andric case loop_type_t::loop_type_uint8: 23006c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv)); 23106c3fb27SDimitry Andric break; 23206c3fb27SDimitry Andric case loop_type_t::loop_type_int16: 23306c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv)); 23406c3fb27SDimitry Andric break; 23506c3fb27SDimitry Andric case loop_type_t::loop_type_uint16: 23606c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv)); 23706c3fb27SDimitry Andric break; 23806c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 23906c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv)); 24006c3fb27SDimitry Andric break; 24106c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 24206c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv)); 24306c3fb27SDimitry Andric break; 24406c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 24506c3fb27SDimitry Andric res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv)); 24606c3fb27SDimitry Andric break; 24706c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 24806c3fb27SDimitry Andric res = static_cast<kmp_uint64>(original_iv); 24906c3fb27SDimitry Andric break; 25006c3fb27SDimitry Andric default: 25106c3fb27SDimitry Andric KMP_ASSERT(false); 25206c3fb27SDimitry Andric } 25306c3fb27SDimitry Andric 25406c3fb27SDimitry Andric return res; 25506c3fb27SDimitry Andric } 25606c3fb27SDimitry Andric 25706c3fb27SDimitry Andric //----------Compare two IVs (remember they have a type)----------------------- 25806c3fb27SDimitry Andric 25906c3fb27SDimitry Andric bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1, 26006c3fb27SDimitry Andric kmp_uint64 original_iv2) { 26106c3fb27SDimitry Andric bool res = false; 26206c3fb27SDimitry Andric 26306c3fb27SDimitry Andric switch (loop_iv_type) { 26406c3fb27SDimitry Andric case loop_type_t::loop_type_int8: 26506c3fb27SDimitry Andric res = static_cast<kmp_int8>(original_iv1) == 26606c3fb27SDimitry Andric static_cast<kmp_int8>(original_iv2); 26706c3fb27SDimitry Andric break; 26806c3fb27SDimitry Andric case loop_type_t::loop_type_uint8: 26906c3fb27SDimitry Andric res = static_cast<kmp_uint8>(original_iv1) == 27006c3fb27SDimitry Andric static_cast<kmp_uint8>(original_iv2); 27106c3fb27SDimitry Andric break; 27206c3fb27SDimitry Andric case loop_type_t::loop_type_int16: 27306c3fb27SDimitry Andric res = static_cast<kmp_int16>(original_iv1) == 27406c3fb27SDimitry Andric static_cast<kmp_int16>(original_iv2); 27506c3fb27SDimitry Andric break; 27606c3fb27SDimitry Andric case loop_type_t::loop_type_uint16: 27706c3fb27SDimitry Andric res = static_cast<kmp_uint16>(original_iv1) == 27806c3fb27SDimitry Andric static_cast<kmp_uint16>(original_iv2); 27906c3fb27SDimitry Andric break; 28006c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 28106c3fb27SDimitry Andric res = static_cast<kmp_int32>(original_iv1) == 28206c3fb27SDimitry Andric static_cast<kmp_int32>(original_iv2); 28306c3fb27SDimitry Andric break; 28406c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 28506c3fb27SDimitry Andric res = static_cast<kmp_uint32>(original_iv1) == 28606c3fb27SDimitry Andric static_cast<kmp_uint32>(original_iv2); 28706c3fb27SDimitry Andric break; 28806c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 28906c3fb27SDimitry Andric res = static_cast<kmp_int64>(original_iv1) == 29006c3fb27SDimitry Andric static_cast<kmp_int64>(original_iv2); 29106c3fb27SDimitry Andric break; 29206c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 29306c3fb27SDimitry Andric res = static_cast<kmp_uint64>(original_iv1) == 29406c3fb27SDimitry Andric static_cast<kmp_uint64>(original_iv2); 29506c3fb27SDimitry Andric break; 29606c3fb27SDimitry Andric default: 29706c3fb27SDimitry Andric KMP_ASSERT(false); 29806c3fb27SDimitry Andric } 29906c3fb27SDimitry Andric 30006c3fb27SDimitry Andric return res; 30106c3fb27SDimitry Andric } 30206c3fb27SDimitry Andric 30306c3fb27SDimitry Andric //----------Calculate original iv on one level-------------------------------- 30406c3fb27SDimitry Andric 30506c3fb27SDimitry Andric // Return true if the point fits into upper bounds on this level, 30606c3fb27SDimitry Andric // false otherwise 30706c3fb27SDimitry Andric template <typename T> 30806c3fb27SDimitry Andric bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds, 30906c3fb27SDimitry Andric const kmp_point_t original_ivs, 31006c3fb27SDimitry Andric kmp_index_t ind) { 31106c3fb27SDimitry Andric 31206c3fb27SDimitry Andric T iv = static_cast<T>(original_ivs[ind]); 31306c3fb27SDimitry Andric T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]); 31406c3fb27SDimitry Andric 31506c3fb27SDimitry Andric if (((bounds->comparison == comparison_t::comp_less_or_eq) && 31606c3fb27SDimitry Andric (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) || 31706c3fb27SDimitry Andric ((bounds->comparison == comparison_t::comp_greater_or_eq) && 31806c3fb27SDimitry Andric (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) { 31906c3fb27SDimitry Andric // The calculated point is outside of loop upper boundary: 32006c3fb27SDimitry Andric return false; 32106c3fb27SDimitry Andric } 32206c3fb27SDimitry Andric 32306c3fb27SDimitry Andric return true; 32406c3fb27SDimitry Andric } 32506c3fb27SDimitry Andric 32606c3fb27SDimitry Andric // Calculate one iv corresponding to iteration on the level ind. 32706c3fb27SDimitry Andric // Return true if it fits into lower-upper bounds on this level 32806c3fb27SDimitry Andric // (if not, we need to re-calculate) 32906c3fb27SDimitry Andric template <typename T> 33006c3fb27SDimitry Andric bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds, 33106c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, 33206c3fb27SDimitry Andric const kmp_iterations_t iterations, kmp_index_t ind, 33306c3fb27SDimitry Andric bool start_with_lower_bound, bool checkBounds) { 33406c3fb27SDimitry Andric 33506c3fb27SDimitry Andric kmp_uint64 temp = 0; 33606c3fb27SDimitry Andric T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]); 33706c3fb27SDimitry Andric 33806c3fb27SDimitry Andric if (start_with_lower_bound) { 33906c3fb27SDimitry Andric // we moved to the next iteration on one of outer loops, should start 34006c3fb27SDimitry Andric // with the lower bound here: 34106c3fb27SDimitry Andric temp = bounds->lb0 + bounds->lb1 * outer_iv; 34206c3fb27SDimitry Andric } else { 34306c3fb27SDimitry Andric auto iteration = iterations[ind]; 34406c3fb27SDimitry Andric temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step; 34506c3fb27SDimitry Andric } 34606c3fb27SDimitry Andric 34706c3fb27SDimitry Andric // Now trim original iv according to its type: 34806c3fb27SDimitry Andric original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); 34906c3fb27SDimitry Andric 35006c3fb27SDimitry Andric if (checkBounds) { 35106c3fb27SDimitry Andric return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind); 35206c3fb27SDimitry Andric } else { 35306c3fb27SDimitry Andric return true; 35406c3fb27SDimitry Andric } 35506c3fb27SDimitry Andric } 35606c3fb27SDimitry Andric 35706c3fb27SDimitry Andric bool kmp_calc_one_iv(const bounds_info_t *bounds, 35806c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, 35906c3fb27SDimitry Andric const kmp_iterations_t iterations, kmp_index_t ind, 36006c3fb27SDimitry Andric bool start_with_lower_bound, bool checkBounds) { 36106c3fb27SDimitry Andric 36206c3fb27SDimitry Andric switch (bounds->loop_type) { 36306c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 36406c3fb27SDimitry Andric return kmp_calc_one_iv_XX<kmp_int32>( 36506c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int32> *)(bounds), 36606c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, 36706c3fb27SDimitry Andric checkBounds); 36806c3fb27SDimitry Andric break; 36906c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 37006c3fb27SDimitry Andric return kmp_calc_one_iv_XX<kmp_uint32>( 37106c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint32> *)(bounds), 37206c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, 37306c3fb27SDimitry Andric checkBounds); 37406c3fb27SDimitry Andric break; 37506c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 37606c3fb27SDimitry Andric return kmp_calc_one_iv_XX<kmp_int64>( 37706c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int64> *)(bounds), 37806c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, 37906c3fb27SDimitry Andric checkBounds); 38006c3fb27SDimitry Andric break; 38106c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 38206c3fb27SDimitry Andric return kmp_calc_one_iv_XX<kmp_uint64>( 38306c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint64> *)(bounds), 38406c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound, 38506c3fb27SDimitry Andric checkBounds); 38606c3fb27SDimitry Andric break; 38706c3fb27SDimitry Andric default: 38806c3fb27SDimitry Andric KMP_ASSERT(false); 38906c3fb27SDimitry Andric return false; 39006c3fb27SDimitry Andric } 39106c3fb27SDimitry Andric } 39206c3fb27SDimitry Andric 39306c3fb27SDimitry Andric //----------Calculate original iv on one level for rectangular loop nest------ 39406c3fb27SDimitry Andric 39506c3fb27SDimitry Andric // Calculate one iv corresponding to iteration on the level ind. 39606c3fb27SDimitry Andric // Return true if it fits into lower-upper bounds on this level 39706c3fb27SDimitry Andric // (if not, we need to re-calculate) 39806c3fb27SDimitry Andric template <typename T> 39906c3fb27SDimitry Andric void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds, 40006c3fb27SDimitry Andric /*in/out*/ kmp_uint64 *original_ivs, 40106c3fb27SDimitry Andric const kmp_iterations_t iterations, 40206c3fb27SDimitry Andric kmp_index_t ind) { 40306c3fb27SDimitry Andric 40406c3fb27SDimitry Andric auto iteration = iterations[ind]; 40506c3fb27SDimitry Andric 40606c3fb27SDimitry Andric kmp_uint64 temp = 40706c3fb27SDimitry Andric bounds->lb0 + 40806c3fb27SDimitry Andric bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) + 40906c3fb27SDimitry Andric iteration * bounds->step; 41006c3fb27SDimitry Andric 41106c3fb27SDimitry Andric // Now trim original iv according to its type: 41206c3fb27SDimitry Andric original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); 41306c3fb27SDimitry Andric } 41406c3fb27SDimitry Andric 41506c3fb27SDimitry Andric void kmp_calc_one_iv_rectang(const bounds_info_t *bounds, 41606c3fb27SDimitry Andric /*in/out*/ kmp_uint64 *original_ivs, 41706c3fb27SDimitry Andric const kmp_iterations_t iterations, 41806c3fb27SDimitry Andric kmp_index_t ind) { 41906c3fb27SDimitry Andric 42006c3fb27SDimitry Andric switch (bounds->loop_type) { 42106c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 42206c3fb27SDimitry Andric kmp_calc_one_iv_rectang_XX<kmp_int32>( 42306c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int32> *)(bounds), 42406c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind); 42506c3fb27SDimitry Andric break; 42606c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 42706c3fb27SDimitry Andric kmp_calc_one_iv_rectang_XX<kmp_uint32>( 42806c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint32> *)(bounds), 42906c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind); 43006c3fb27SDimitry Andric break; 43106c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 43206c3fb27SDimitry Andric kmp_calc_one_iv_rectang_XX<kmp_int64>( 43306c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int64> *)(bounds), 43406c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind); 43506c3fb27SDimitry Andric break; 43606c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 43706c3fb27SDimitry Andric kmp_calc_one_iv_rectang_XX<kmp_uint64>( 43806c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint64> *)(bounds), 43906c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind); 44006c3fb27SDimitry Andric break; 44106c3fb27SDimitry Andric default: 44206c3fb27SDimitry Andric KMP_ASSERT(false); 44306c3fb27SDimitry Andric } 44406c3fb27SDimitry Andric } 44506c3fb27SDimitry Andric 44606c3fb27SDimitry Andric //---------------------------------------------------------------------------- 44706c3fb27SDimitry Andric // Rectangular loop nest 44806c3fb27SDimitry Andric //---------------------------------------------------------------------------- 44906c3fb27SDimitry Andric 45006c3fb27SDimitry Andric //----------Canonicalize loop nest and calculate trip count------------------- 45106c3fb27SDimitry Andric 45206c3fb27SDimitry Andric // Canonicalize loop nest and calculate overall trip count. 45306c3fb27SDimitry Andric // "bounds_nest" has to be allocated per thread. 45406c3fb27SDimitry Andric // API will modify original bounds_nest array to bring it to a canonical form 45506c3fb27SDimitry Andric // (only <= and >=, no !=, <, >). If the original loop nest was already in a 45606c3fb27SDimitry Andric // canonical form there will be no changes to bounds in bounds_nest array 45706c3fb27SDimitry Andric // (only trip counts will be calculated). 45806c3fb27SDimitry Andric // Returns trip count of overall space. 45906c3fb27SDimitry Andric extern "C" kmp_loop_nest_iv_t 46006c3fb27SDimitry Andric __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid, 46106c3fb27SDimitry Andric /*in/out*/ bounds_info_t *original_bounds_nest, 46206c3fb27SDimitry Andric kmp_index_t n) { 46306c3fb27SDimitry Andric 46406c3fb27SDimitry Andric kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); 46506c3fb27SDimitry Andric 46606c3fb27SDimitry Andric kmp_loop_nest_iv_t total = 1; 46706c3fb27SDimitry Andric 46806c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 46906c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 47006c3fb27SDimitry Andric 47106c3fb27SDimitry Andric kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds); 47206c3fb27SDimitry Andric total *= trip_count; 47306c3fb27SDimitry Andric } 47406c3fb27SDimitry Andric 47506c3fb27SDimitry Andric return total; 47606c3fb27SDimitry Andric } 47706c3fb27SDimitry Andric 47806c3fb27SDimitry Andric //----------Calculate old induction variables--------------------------------- 47906c3fb27SDimitry Andric 48006c3fb27SDimitry Andric // Calculate old induction variables corresponding to overall new_iv. 48106c3fb27SDimitry Andric // Note: original IV will be returned as if it had kmp_uint64 type, 48206c3fb27SDimitry Andric // will have to be converted to original type in user code. 48306c3fb27SDimitry Andric // Note: trip counts should be already calculated by 48406c3fb27SDimitry Andric // __kmpc_process_loop_nest_rectang. 48506c3fb27SDimitry Andric // OMPTODO: special case 2, 3 nested loops: either do different 48606c3fb27SDimitry Andric // interface without array or possibly template this over n 48706c3fb27SDimitry Andric extern "C" void 48806c3fb27SDimitry Andric __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, 48906c3fb27SDimitry Andric const bounds_info_t *original_bounds_nest, 49006c3fb27SDimitry Andric /*out*/ kmp_uint64 *original_ivs, 49106c3fb27SDimitry Andric kmp_index_t n) { 49206c3fb27SDimitry Andric 4935f757f3fSDimitry Andric CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); 49406c3fb27SDimitry Andric 49506c3fb27SDimitry Andric // First, calc corresponding iteration in every original loop: 49606c3fb27SDimitry Andric for (kmp_index_t ind = n; ind > 0;) { 49706c3fb27SDimitry Andric --ind; 49806c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 49906c3fb27SDimitry Andric 50006c3fb27SDimitry Andric // should be optimized to OPDIVREM: 50106c3fb27SDimitry Andric auto temp = new_iv / bounds->trip_count; 50206c3fb27SDimitry Andric auto iteration = new_iv % bounds->trip_count; 50306c3fb27SDimitry Andric new_iv = temp; 50406c3fb27SDimitry Andric 50506c3fb27SDimitry Andric iterations[ind] = iteration; 50606c3fb27SDimitry Andric } 50706c3fb27SDimitry Andric KMP_ASSERT(new_iv == 0); 50806c3fb27SDimitry Andric 50906c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 51006c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 51106c3fb27SDimitry Andric 51206c3fb27SDimitry Andric kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind); 51306c3fb27SDimitry Andric } 51406c3fb27SDimitry Andric } 51506c3fb27SDimitry Andric 51606c3fb27SDimitry Andric //---------------------------------------------------------------------------- 51706c3fb27SDimitry Andric // Non-rectangular loop nest 51806c3fb27SDimitry Andric //---------------------------------------------------------------------------- 51906c3fb27SDimitry Andric 52006c3fb27SDimitry Andric //----------Calculate maximum possible span of iv values on one level--------- 52106c3fb27SDimitry Andric 52206c3fb27SDimitry Andric // Calculate span for IV on this loop level for "<=" case. 52306c3fb27SDimitry Andric // Note: it's for <= on this loop nest level, so lower bound should be smallest 52406c3fb27SDimitry Andric // value, upper bound should be the biggest value. If the loop won't execute, 52506c3fb27SDimitry Andric // 'smallest' may be bigger than 'biggest', but we'd better not switch them 52606c3fb27SDimitry Andric // around. 52706c3fb27SDimitry Andric template <typename T> 52806c3fb27SDimitry Andric void kmp_calc_span_lessoreq_XX( 52906c3fb27SDimitry Andric /* in/out*/ bounds_info_internalXX_template<T> *bounds, 53006c3fb27SDimitry Andric /* in/out*/ bounds_info_internal_t *bounds_nest) { 53106c3fb27SDimitry Andric 53206c3fb27SDimitry Andric typedef typename traits_t<T>::unsigned_t UT; 53306c3fb27SDimitry Andric // typedef typename traits_t<T>::signed_t ST; 53406c3fb27SDimitry Andric 53506c3fb27SDimitry Andric // typedef typename big_span_t span_t; 53606c3fb27SDimitry Andric typedef T span_t; 53706c3fb27SDimitry Andric 53806c3fb27SDimitry Andric auto &bbounds = bounds->b; 53906c3fb27SDimitry Andric 54006c3fb27SDimitry Andric if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) { 54106c3fb27SDimitry Andric // This dimention depends on one of previous ones; can't be the outermost 54206c3fb27SDimitry Andric // one. 54306c3fb27SDimitry Andric bounds_info_internalXX_template<T> *previous = 54406c3fb27SDimitry Andric reinterpret_cast<bounds_info_internalXX_template<T> *>( 54506c3fb27SDimitry Andric &(bounds_nest[bbounds.outer_iv])); 54606c3fb27SDimitry Andric 54706c3fb27SDimitry Andric // OMPTODO: assert that T is compatible with loop variable type on 54806c3fb27SDimitry Andric // 'previous' loop 54906c3fb27SDimitry Andric 55006c3fb27SDimitry Andric { 55106c3fb27SDimitry Andric span_t bound_candidate1 = 55206c3fb27SDimitry Andric bbounds.lb0 + bbounds.lb1 * previous->span_smallest; 55306c3fb27SDimitry Andric span_t bound_candidate2 = 55406c3fb27SDimitry Andric bbounds.lb0 + bbounds.lb1 * previous->span_biggest; 55506c3fb27SDimitry Andric if (bound_candidate1 < bound_candidate2) { 55606c3fb27SDimitry Andric bounds->span_smallest = bound_candidate1; 55706c3fb27SDimitry Andric } else { 55806c3fb27SDimitry Andric bounds->span_smallest = bound_candidate2; 55906c3fb27SDimitry Andric } 56006c3fb27SDimitry Andric } 56106c3fb27SDimitry Andric 56206c3fb27SDimitry Andric { 56306c3fb27SDimitry Andric // We can't adjust the upper bound with respect to step, because 56406c3fb27SDimitry Andric // lower bound might be off after adjustments 56506c3fb27SDimitry Andric 56606c3fb27SDimitry Andric span_t bound_candidate1 = 56706c3fb27SDimitry Andric bbounds.ub0 + bbounds.ub1 * previous->span_smallest; 56806c3fb27SDimitry Andric span_t bound_candidate2 = 56906c3fb27SDimitry Andric bbounds.ub0 + bbounds.ub1 * previous->span_biggest; 57006c3fb27SDimitry Andric if (bound_candidate1 < bound_candidate2) { 57106c3fb27SDimitry Andric bounds->span_biggest = bound_candidate2; 57206c3fb27SDimitry Andric } else { 57306c3fb27SDimitry Andric bounds->span_biggest = bound_candidate1; 57406c3fb27SDimitry Andric } 57506c3fb27SDimitry Andric } 57606c3fb27SDimitry Andric } else { 57706c3fb27SDimitry Andric // Rectangular: 57806c3fb27SDimitry Andric bounds->span_smallest = bbounds.lb0; 57906c3fb27SDimitry Andric bounds->span_biggest = bbounds.ub0; 58006c3fb27SDimitry Andric } 58106c3fb27SDimitry Andric if (!bounds->loop_bounds_adjusted) { 58206c3fb27SDimitry Andric // Here it's safe to reduce the space to the multiply of step. 58306c3fb27SDimitry Andric // OMPTODO: check if the formular is correct. 58406c3fb27SDimitry Andric // Also check if it would be safe to do this if we didn't adjust left side. 58506c3fb27SDimitry Andric bounds->span_biggest -= 58606c3fb27SDimitry Andric (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs? 58706c3fb27SDimitry Andric } 58806c3fb27SDimitry Andric } 58906c3fb27SDimitry Andric 59006c3fb27SDimitry Andric // Calculate span for IV on this loop level for ">=" case. 59106c3fb27SDimitry Andric template <typename T> 59206c3fb27SDimitry Andric void kmp_calc_span_greateroreq_XX( 59306c3fb27SDimitry Andric /* in/out*/ bounds_info_internalXX_template<T> *bounds, 59406c3fb27SDimitry Andric /* in/out*/ bounds_info_internal_t *bounds_nest) { 59506c3fb27SDimitry Andric 59606c3fb27SDimitry Andric typedef typename traits_t<T>::unsigned_t UT; 59706c3fb27SDimitry Andric // typedef typename traits_t<T>::signed_t ST; 59806c3fb27SDimitry Andric 59906c3fb27SDimitry Andric // typedef typename big_span_t span_t; 60006c3fb27SDimitry Andric typedef T span_t; 60106c3fb27SDimitry Andric 60206c3fb27SDimitry Andric auto &bbounds = bounds->b; 60306c3fb27SDimitry Andric 60406c3fb27SDimitry Andric if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) { 60506c3fb27SDimitry Andric // This dimention depends on one of previous ones; can't be the outermost 60606c3fb27SDimitry Andric // one. 60706c3fb27SDimitry Andric bounds_info_internalXX_template<T> *previous = 60806c3fb27SDimitry Andric reinterpret_cast<bounds_info_internalXX_template<T> *>( 60906c3fb27SDimitry Andric &(bounds_nest[bbounds.outer_iv])); 61006c3fb27SDimitry Andric 61106c3fb27SDimitry Andric // OMPTODO: assert that T is compatible with loop variable type on 61206c3fb27SDimitry Andric // 'previous' loop 61306c3fb27SDimitry Andric 61406c3fb27SDimitry Andric { 61506c3fb27SDimitry Andric span_t bound_candidate1 = 61606c3fb27SDimitry Andric bbounds.lb0 + bbounds.lb1 * previous->span_smallest; 61706c3fb27SDimitry Andric span_t bound_candidate2 = 61806c3fb27SDimitry Andric bbounds.lb0 + bbounds.lb1 * previous->span_biggest; 61906c3fb27SDimitry Andric if (bound_candidate1 >= bound_candidate2) { 62006c3fb27SDimitry Andric bounds->span_smallest = bound_candidate1; 62106c3fb27SDimitry Andric } else { 62206c3fb27SDimitry Andric bounds->span_smallest = bound_candidate2; 62306c3fb27SDimitry Andric } 62406c3fb27SDimitry Andric } 62506c3fb27SDimitry Andric 62606c3fb27SDimitry Andric { 62706c3fb27SDimitry Andric // We can't adjust the upper bound with respect to step, because 62806c3fb27SDimitry Andric // lower bound might be off after adjustments 62906c3fb27SDimitry Andric 63006c3fb27SDimitry Andric span_t bound_candidate1 = 63106c3fb27SDimitry Andric bbounds.ub0 + bbounds.ub1 * previous->span_smallest; 63206c3fb27SDimitry Andric span_t bound_candidate2 = 63306c3fb27SDimitry Andric bbounds.ub0 + bbounds.ub1 * previous->span_biggest; 63406c3fb27SDimitry Andric if (bound_candidate1 >= bound_candidate2) { 63506c3fb27SDimitry Andric bounds->span_biggest = bound_candidate2; 63606c3fb27SDimitry Andric } else { 63706c3fb27SDimitry Andric bounds->span_biggest = bound_candidate1; 63806c3fb27SDimitry Andric } 63906c3fb27SDimitry Andric } 64006c3fb27SDimitry Andric 64106c3fb27SDimitry Andric } else { 64206c3fb27SDimitry Andric // Rectangular: 64306c3fb27SDimitry Andric bounds->span_biggest = bbounds.lb0; 64406c3fb27SDimitry Andric bounds->span_smallest = bbounds.ub0; 64506c3fb27SDimitry Andric } 64606c3fb27SDimitry Andric if (!bounds->loop_bounds_adjusted) { 64706c3fb27SDimitry Andric // Here it's safe to reduce the space to the multiply of step. 64806c3fb27SDimitry Andric // OMPTODO: check if the formular is correct. 64906c3fb27SDimitry Andric // Also check if it would be safe to do this if we didn't adjust left side. 65006c3fb27SDimitry Andric bounds->span_biggest -= 65106c3fb27SDimitry Andric (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs? 65206c3fb27SDimitry Andric } 65306c3fb27SDimitry Andric } 65406c3fb27SDimitry Andric 65506c3fb27SDimitry Andric // Calculate maximum possible span for IV on this loop level. 65606c3fb27SDimitry Andric template <typename T> 65706c3fb27SDimitry Andric void kmp_calc_span_XX( 65806c3fb27SDimitry Andric /* in/out*/ bounds_info_internalXX_template<T> *bounds, 65906c3fb27SDimitry Andric /* in/out*/ bounds_info_internal_t *bounds_nest) { 66006c3fb27SDimitry Andric 66106c3fb27SDimitry Andric if (bounds->b.comparison == comparison_t::comp_less_or_eq) { 66206c3fb27SDimitry Andric kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); 66306c3fb27SDimitry Andric } else { 66406c3fb27SDimitry Andric KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq); 66506c3fb27SDimitry Andric kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); 66606c3fb27SDimitry Andric } 66706c3fb27SDimitry Andric } 66806c3fb27SDimitry Andric 66906c3fb27SDimitry Andric //----------All initial processing of the loop nest--------------------------- 67006c3fb27SDimitry Andric 67106c3fb27SDimitry Andric // Calculate new bounds for this loop level. 67206c3fb27SDimitry Andric // To be able to work with the nest we need to get it to a parallelepiped shape. 67306c3fb27SDimitry Andric // We need to stay in the original range of values, so that there will be no 67406c3fb27SDimitry Andric // overflow, for that we'll adjust both upper and lower bounds as needed. 67506c3fb27SDimitry Andric template <typename T> 67606c3fb27SDimitry Andric void kmp_calc_new_bounds_XX( 67706c3fb27SDimitry Andric /* in/out*/ bounds_info_internalXX_template<T> *bounds, 67806c3fb27SDimitry Andric /* in/out*/ bounds_info_internal_t *bounds_nest) { 67906c3fb27SDimitry Andric 68006c3fb27SDimitry Andric auto &bbounds = bounds->b; 68106c3fb27SDimitry Andric 68206c3fb27SDimitry Andric if (bbounds.lb1 == bbounds.ub1) { 68306c3fb27SDimitry Andric // Already parallel, no need to adjust: 68406c3fb27SDimitry Andric bounds->loop_bounds_adjusted = false; 68506c3fb27SDimitry Andric } else { 68606c3fb27SDimitry Andric bounds->loop_bounds_adjusted = true; 68706c3fb27SDimitry Andric 68806c3fb27SDimitry Andric T old_lb1 = bbounds.lb1; 68906c3fb27SDimitry Andric T old_ub1 = bbounds.ub1; 69006c3fb27SDimitry Andric 69106c3fb27SDimitry Andric if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) { 69206c3fb27SDimitry Andric // With this shape we can adjust to a rectangle: 69306c3fb27SDimitry Andric bbounds.lb1 = 0; 69406c3fb27SDimitry Andric bbounds.ub1 = 0; 69506c3fb27SDimitry Andric } else { 69606c3fb27SDimitry Andric // get upper and lower bounds to be parallel 69706c3fb27SDimitry Andric // with values in the old range. 69806c3fb27SDimitry Andric // Note: abs didn't work here. 69906c3fb27SDimitry Andric if (((old_lb1 < 0) && (old_lb1 < old_ub1)) || 70006c3fb27SDimitry Andric ((old_lb1 > 0) && (old_lb1 > old_ub1))) { 70106c3fb27SDimitry Andric bbounds.lb1 = old_ub1; 70206c3fb27SDimitry Andric } else { 70306c3fb27SDimitry Andric bbounds.ub1 = old_lb1; 70406c3fb27SDimitry Andric } 70506c3fb27SDimitry Andric } 70606c3fb27SDimitry Andric 70706c3fb27SDimitry Andric // Now need to adjust lb0, ub0, otherwise in some cases space will shrink. 70806c3fb27SDimitry Andric // The idea here that for this IV we are now getting the same span 70906c3fb27SDimitry Andric // irrespective of the previous IV value. 71006c3fb27SDimitry Andric bounds_info_internalXX_template<T> *previous = 71106c3fb27SDimitry Andric reinterpret_cast<bounds_info_internalXX_template<T> *>( 71206c3fb27SDimitry Andric &bounds_nest[bbounds.outer_iv]); 71306c3fb27SDimitry Andric 71406c3fb27SDimitry Andric if (bbounds.comparison == comparison_t::comp_less_or_eq) { 71506c3fb27SDimitry Andric if (old_lb1 < bbounds.lb1) { 71606c3fb27SDimitry Andric KMP_ASSERT(old_lb1 < 0); 71706c3fb27SDimitry Andric // The length is good on outer_iv biggest number, 71806c3fb27SDimitry Andric // can use it to find where to move the lower bound: 71906c3fb27SDimitry Andric 72006c3fb27SDimitry Andric T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest; 72106c3fb27SDimitry Andric bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space? 72206c3fb27SDimitry Andric // e.g. it was 0?? (same below) 72306c3fb27SDimitry Andric } else if (old_lb1 > bbounds.lb1) { 72406c3fb27SDimitry Andric // still need to move lower bound: 72506c3fb27SDimitry Andric T add = (old_lb1 - bbounds.lb1) * previous->span_smallest; 72606c3fb27SDimitry Andric bbounds.lb0 += add; 72706c3fb27SDimitry Andric } 72806c3fb27SDimitry Andric 72906c3fb27SDimitry Andric if (old_ub1 > bbounds.ub1) { 73006c3fb27SDimitry Andric KMP_ASSERT(old_ub1 > 0); 73106c3fb27SDimitry Andric // The length is good on outer_iv biggest number, 73206c3fb27SDimitry Andric // can use it to find where to move upper bound: 73306c3fb27SDimitry Andric 73406c3fb27SDimitry Andric T add = (old_ub1 - bbounds.ub1) * previous->span_biggest; 73506c3fb27SDimitry Andric bbounds.ub0 += add; 73606c3fb27SDimitry Andric } else if (old_ub1 < bbounds.ub1) { 73706c3fb27SDimitry Andric // still need to move upper bound: 73806c3fb27SDimitry Andric T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest; 73906c3fb27SDimitry Andric bbounds.ub0 -= sub; 74006c3fb27SDimitry Andric } 74106c3fb27SDimitry Andric } else { 74206c3fb27SDimitry Andric KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq); 74306c3fb27SDimitry Andric if (old_lb1 < bbounds.lb1) { 74406c3fb27SDimitry Andric KMP_ASSERT(old_lb1 < 0); 74506c3fb27SDimitry Andric T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest; 74606c3fb27SDimitry Andric bbounds.lb0 -= sub; 74706c3fb27SDimitry Andric } else if (old_lb1 > bbounds.lb1) { 74806c3fb27SDimitry Andric T add = (old_lb1 - bbounds.lb1) * previous->span_biggest; 74906c3fb27SDimitry Andric bbounds.lb0 += add; 75006c3fb27SDimitry Andric } 75106c3fb27SDimitry Andric 75206c3fb27SDimitry Andric if (old_ub1 > bbounds.ub1) { 75306c3fb27SDimitry Andric KMP_ASSERT(old_ub1 > 0); 75406c3fb27SDimitry Andric T add = (old_ub1 - bbounds.ub1) * previous->span_smallest; 75506c3fb27SDimitry Andric bbounds.ub0 += add; 75606c3fb27SDimitry Andric } else if (old_ub1 < bbounds.ub1) { 75706c3fb27SDimitry Andric T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest; 75806c3fb27SDimitry Andric bbounds.ub0 -= sub; 75906c3fb27SDimitry Andric } 76006c3fb27SDimitry Andric } 76106c3fb27SDimitry Andric } 76206c3fb27SDimitry Andric } 76306c3fb27SDimitry Andric 76406c3fb27SDimitry Andric // Do all processing for one canonicalized loop in the nest 76506c3fb27SDimitry Andric // (assuming that outer loops already were processed): 76606c3fb27SDimitry Andric template <typename T> 76706c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_process_one_loop_XX( 76806c3fb27SDimitry Andric /* in/out*/ bounds_info_internalXX_template<T> *bounds, 76906c3fb27SDimitry Andric /*in/out*/ bounds_info_internal_t *bounds_nest) { 77006c3fb27SDimitry Andric 77106c3fb27SDimitry Andric kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); 77206c3fb27SDimitry Andric kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest); 77306c3fb27SDimitry Andric return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b)); 77406c3fb27SDimitry Andric } 77506c3fb27SDimitry Andric 77606c3fb27SDimitry Andric // Non-rectangular loop nest, canonicalized to use <= or >=. 77706c3fb27SDimitry Andric // Process loop nest to have a parallelepiped shape, 77806c3fb27SDimitry Andric // calculate biggest spans for IV's on all levels and calculate overall trip 77906c3fb27SDimitry Andric // count. "bounds_nest" has to be allocated per thread. 78006c3fb27SDimitry Andric // Returns overall trip count (for adjusted space). 78106c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_process_loop_nest( 78206c3fb27SDimitry Andric /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) { 78306c3fb27SDimitry Andric 78406c3fb27SDimitry Andric kmp_loop_nest_iv_t total = 1; 78506c3fb27SDimitry Andric 78606c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 78706c3fb27SDimitry Andric auto bounds = &(bounds_nest[ind]); 78806c3fb27SDimitry Andric kmp_loop_nest_iv_t trip_count = 0; 78906c3fb27SDimitry Andric 79006c3fb27SDimitry Andric switch (bounds->b.loop_type) { 79106c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 79206c3fb27SDimitry Andric trip_count = kmp_process_one_loop_XX<kmp_int32>( 79306c3fb27SDimitry Andric /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds), 79406c3fb27SDimitry Andric /*in/out*/ bounds_nest); 79506c3fb27SDimitry Andric break; 79606c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 79706c3fb27SDimitry Andric trip_count = kmp_process_one_loop_XX<kmp_uint32>( 79806c3fb27SDimitry Andric /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds), 79906c3fb27SDimitry Andric /*in/out*/ bounds_nest); 80006c3fb27SDimitry Andric break; 80106c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 80206c3fb27SDimitry Andric trip_count = kmp_process_one_loop_XX<kmp_int64>( 80306c3fb27SDimitry Andric /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds), 80406c3fb27SDimitry Andric /*in/out*/ bounds_nest); 80506c3fb27SDimitry Andric break; 80606c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 80706c3fb27SDimitry Andric trip_count = kmp_process_one_loop_XX<kmp_uint64>( 80806c3fb27SDimitry Andric /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds), 80906c3fb27SDimitry Andric /*in/out*/ bounds_nest); 81006c3fb27SDimitry Andric break; 81106c3fb27SDimitry Andric default: 81206c3fb27SDimitry Andric KMP_ASSERT(false); 81306c3fb27SDimitry Andric } 81406c3fb27SDimitry Andric total *= trip_count; 81506c3fb27SDimitry Andric } 81606c3fb27SDimitry Andric 81706c3fb27SDimitry Andric return total; 81806c3fb27SDimitry Andric } 81906c3fb27SDimitry Andric 82006c3fb27SDimitry Andric //----------Calculate iterations (in the original or updated space)----------- 82106c3fb27SDimitry Andric 82206c3fb27SDimitry Andric // Calculate number of iterations in original or updated space resulting in 82306c3fb27SDimitry Andric // original_ivs[ind] (only on this level, non-negative) 82406c3fb27SDimitry Andric // (not counting initial iteration) 82506c3fb27SDimitry Andric template <typename T> 82606c3fb27SDimitry Andric kmp_loop_nest_iv_t 82706c3fb27SDimitry Andric kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds, 82806c3fb27SDimitry Andric const kmp_point_t original_ivs, 82906c3fb27SDimitry Andric kmp_index_t ind) { 83006c3fb27SDimitry Andric 83106c3fb27SDimitry Andric kmp_loop_nest_iv_t iterations = 0; 83206c3fb27SDimitry Andric 83306c3fb27SDimitry Andric if (bounds->comparison == comparison_t::comp_less_or_eq) { 83406c3fb27SDimitry Andric iterations = 83506c3fb27SDimitry Andric (static_cast<T>(original_ivs[ind]) - bounds->lb0 - 83606c3fb27SDimitry Andric bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) / 83706c3fb27SDimitry Andric __kmp_abs(bounds->step); 83806c3fb27SDimitry Andric } else { 83906c3fb27SDimitry Andric KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq); 84006c3fb27SDimitry Andric iterations = (bounds->lb0 + 84106c3fb27SDimitry Andric bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) - 84206c3fb27SDimitry Andric static_cast<T>(original_ivs[ind])) / 84306c3fb27SDimitry Andric __kmp_abs(bounds->step); 84406c3fb27SDimitry Andric } 84506c3fb27SDimitry Andric 84606c3fb27SDimitry Andric return iterations; 84706c3fb27SDimitry Andric } 84806c3fb27SDimitry Andric 84906c3fb27SDimitry Andric // Calculate number of iterations in the original or updated space resulting in 85006c3fb27SDimitry Andric // original_ivs[ind] (only on this level, non-negative) 85106c3fb27SDimitry Andric kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds, 85206c3fb27SDimitry Andric const kmp_point_t original_ivs, 85306c3fb27SDimitry Andric kmp_index_t ind) { 85406c3fb27SDimitry Andric 85506c3fb27SDimitry Andric switch (bounds->loop_type) { 85606c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 85706c3fb27SDimitry Andric return kmp_calc_number_of_iterations_XX<kmp_int32>( 85806c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind); 85906c3fb27SDimitry Andric break; 86006c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 86106c3fb27SDimitry Andric return kmp_calc_number_of_iterations_XX<kmp_uint32>( 86206c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind); 86306c3fb27SDimitry Andric break; 86406c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 86506c3fb27SDimitry Andric return kmp_calc_number_of_iterations_XX<kmp_int64>( 86606c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind); 86706c3fb27SDimitry Andric break; 86806c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 86906c3fb27SDimitry Andric return kmp_calc_number_of_iterations_XX<kmp_uint64>( 87006c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind); 87106c3fb27SDimitry Andric break; 87206c3fb27SDimitry Andric default: 87306c3fb27SDimitry Andric KMP_ASSERT(false); 87406c3fb27SDimitry Andric return 0; 87506c3fb27SDimitry Andric } 87606c3fb27SDimitry Andric } 87706c3fb27SDimitry Andric 87806c3fb27SDimitry Andric //----------Calculate new iv corresponding to original ivs-------------------- 87906c3fb27SDimitry Andric 88006c3fb27SDimitry Andric // We got a point in the original loop nest. 88106c3fb27SDimitry Andric // Take updated bounds and calculate what new_iv will correspond to this point. 88206c3fb27SDimitry Andric // When we are getting original IVs from new_iv, we have to adjust to fit into 88306c3fb27SDimitry Andric // original loops bounds. Getting new_iv for the adjusted original IVs will help 88406c3fb27SDimitry Andric // with making more chunks non-empty. 88506c3fb27SDimitry Andric kmp_loop_nest_iv_t 88606c3fb27SDimitry Andric kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest, 88706c3fb27SDimitry Andric const kmp_point_t original_ivs, 88806c3fb27SDimitry Andric kmp_index_t n) { 88906c3fb27SDimitry Andric 89006c3fb27SDimitry Andric kmp_loop_nest_iv_t new_iv = 0; 89106c3fb27SDimitry Andric 89206c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 89306c3fb27SDimitry Andric auto bounds = &(bounds_nest[ind].b); 89406c3fb27SDimitry Andric 89506c3fb27SDimitry Andric new_iv = new_iv * bounds->trip_count + 89606c3fb27SDimitry Andric kmp_calc_number_of_iterations(bounds, original_ivs, ind); 89706c3fb27SDimitry Andric } 89806c3fb27SDimitry Andric 89906c3fb27SDimitry Andric return new_iv; 90006c3fb27SDimitry Andric } 90106c3fb27SDimitry Andric 90206c3fb27SDimitry Andric //----------Calculate original ivs for provided iterations-------------------- 90306c3fb27SDimitry Andric 90406c3fb27SDimitry Andric // Calculate original IVs for provided iterations, assuming iterations are 90506c3fb27SDimitry Andric // calculated in the original space. 90606c3fb27SDimitry Andric // Loop nest is in canonical form (with <= / >=). 90706c3fb27SDimitry Andric bool kmp_calc_original_ivs_from_iterations( 90806c3fb27SDimitry Andric const bounds_info_t *original_bounds_nest, kmp_index_t n, 90906c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, 91006c3fb27SDimitry Andric /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) { 91106c3fb27SDimitry Andric 91206c3fb27SDimitry Andric kmp_index_t lengthened_ind = n; 91306c3fb27SDimitry Andric 91406c3fb27SDimitry Andric for (; ind < n;) { 91506c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 91606c3fb27SDimitry Andric bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations, 91706c3fb27SDimitry Andric ind, (lengthened_ind < ind), true); 91806c3fb27SDimitry Andric 91906c3fb27SDimitry Andric if (!good) { 92006c3fb27SDimitry Andric // The calculated iv value is too big (or too small for >=): 92106c3fb27SDimitry Andric if (ind == 0) { 92206c3fb27SDimitry Andric // Space is empty: 92306c3fb27SDimitry Andric return false; 92406c3fb27SDimitry Andric } else { 92506c3fb27SDimitry Andric // Go to next iteration on the outer loop: 92606c3fb27SDimitry Andric --ind; 92706c3fb27SDimitry Andric ++iterations[ind]; 92806c3fb27SDimitry Andric lengthened_ind = ind; 92906c3fb27SDimitry Andric for (kmp_index_t i = ind + 1; i < n; ++i) { 93006c3fb27SDimitry Andric iterations[i] = 0; 93106c3fb27SDimitry Andric } 93206c3fb27SDimitry Andric continue; 93306c3fb27SDimitry Andric } 93406c3fb27SDimitry Andric } 93506c3fb27SDimitry Andric ++ind; 93606c3fb27SDimitry Andric } 93706c3fb27SDimitry Andric 93806c3fb27SDimitry Andric return true; 93906c3fb27SDimitry Andric } 94006c3fb27SDimitry Andric 94106c3fb27SDimitry Andric //----------Calculate original ivs for the beginning of the loop nest--------- 94206c3fb27SDimitry Andric 94306c3fb27SDimitry Andric // Calculate IVs for the beginning of the loop nest. 94406c3fb27SDimitry Andric // Note: lower bounds of all loops may not work - 94506c3fb27SDimitry Andric // if on some of the iterations of the outer loops inner loops are empty. 94606c3fb27SDimitry Andric // Loop nest is in canonical form (with <= / >=). 94706c3fb27SDimitry Andric bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, 94806c3fb27SDimitry Andric kmp_index_t n, 94906c3fb27SDimitry Andric /*out*/ kmp_point_t original_ivs) { 95006c3fb27SDimitry Andric 95106c3fb27SDimitry Andric // Iterations in the original space, multiplied by step: 9525f757f3fSDimitry Andric CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); 95306c3fb27SDimitry Andric for (kmp_index_t ind = n; ind > 0;) { 95406c3fb27SDimitry Andric --ind; 95506c3fb27SDimitry Andric iterations[ind] = 0; 95606c3fb27SDimitry Andric } 95706c3fb27SDimitry Andric 95806c3fb27SDimitry Andric // Now calculate the point: 95906c3fb27SDimitry Andric bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n, 96006c3fb27SDimitry Andric /*in/out*/ original_ivs, 96106c3fb27SDimitry Andric /*in/out*/ iterations, 0); 96206c3fb27SDimitry Andric return b; 96306c3fb27SDimitry Andric } 96406c3fb27SDimitry Andric 96506c3fb27SDimitry Andric //----------Calculate next point in the original loop space------------------- 96606c3fb27SDimitry Andric 96706c3fb27SDimitry Andric // From current set of original IVs calculate next point. 96806c3fb27SDimitry Andric // Return false if there is no next point in the loop bounds. 96906c3fb27SDimitry Andric bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, 97006c3fb27SDimitry Andric kmp_index_t n, const kmp_point_t original_ivs, 97106c3fb27SDimitry Andric /*out*/ kmp_point_t next_original_ivs) { 97206c3fb27SDimitry Andric // Iterations in the original space, multiplied by step (so can be negative): 9735f757f3fSDimitry Andric CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); 97406c3fb27SDimitry Andric // First, calc corresponding iteration in every original loop: 97506c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 97606c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 97706c3fb27SDimitry Andric iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind); 97806c3fb27SDimitry Andric } 97906c3fb27SDimitry Andric 98006c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 98106c3fb27SDimitry Andric next_original_ivs[ind] = original_ivs[ind]; 98206c3fb27SDimitry Andric } 98306c3fb27SDimitry Andric 98406c3fb27SDimitry Andric // Next add one step to the iterations on the inner-most level, and see if we 98506c3fb27SDimitry Andric // need to move up the nest: 98606c3fb27SDimitry Andric kmp_index_t ind = n - 1; 98706c3fb27SDimitry Andric ++iterations[ind]; 98806c3fb27SDimitry Andric 98906c3fb27SDimitry Andric bool b = kmp_calc_original_ivs_from_iterations( 99006c3fb27SDimitry Andric original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind); 99106c3fb27SDimitry Andric 99206c3fb27SDimitry Andric return b; 99306c3fb27SDimitry Andric } 99406c3fb27SDimitry Andric 99506c3fb27SDimitry Andric //----------Calculate chunk end in the original loop space-------------------- 99606c3fb27SDimitry Andric 99706c3fb27SDimitry Andric // For one level calculate old induction variable corresponding to overall 99806c3fb27SDimitry Andric // new_iv for the chunk end. 99906c3fb27SDimitry Andric // Return true if it fits into upper bound on this level 100006c3fb27SDimitry Andric // (if not, we need to re-calculate) 100106c3fb27SDimitry Andric template <typename T> 100206c3fb27SDimitry Andric bool kmp_calc_one_iv_for_chunk_end_XX( 100306c3fb27SDimitry Andric const bounds_infoXX_template<T> *bounds, 100406c3fb27SDimitry Andric const bounds_infoXX_template<T> *updated_bounds, 100506c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations, 100606c3fb27SDimitry Andric kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start, 100706c3fb27SDimitry Andric const kmp_point_t original_ivs_start) { 100806c3fb27SDimitry Andric 100906c3fb27SDimitry Andric // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64> 101006c3fb27SDimitry Andric // big_span_t; 101106c3fb27SDimitry Andric 101206c3fb27SDimitry Andric // OMPTODO: is it good enough, or do we need ST or do we need big_span_t? 101306c3fb27SDimitry Andric T temp = 0; 101406c3fb27SDimitry Andric 101506c3fb27SDimitry Andric T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]); 101606c3fb27SDimitry Andric 101706c3fb27SDimitry Andric if (start_with_lower_bound) { 101806c3fb27SDimitry Andric // we moved to the next iteration on one of outer loops, may as well use 101906c3fb27SDimitry Andric // the lower bound here: 102006c3fb27SDimitry Andric temp = bounds->lb0 + bounds->lb1 * outer_iv; 102106c3fb27SDimitry Andric } else { 102206c3fb27SDimitry Andric // Start in expanded space, but: 102306c3fb27SDimitry Andric // - we need to hit original space lower bound, so need to account for 102406c3fb27SDimitry Andric // that 102506c3fb27SDimitry Andric // - we have to go into original space, even if that means adding more 102606c3fb27SDimitry Andric // iterations than was planned 102706c3fb27SDimitry Andric // - we have to go past (or equal to) previous point (which is the chunk 102806c3fb27SDimitry Andric // starting point) 102906c3fb27SDimitry Andric 103006c3fb27SDimitry Andric auto iteration = iterations[ind]; 103106c3fb27SDimitry Andric 103206c3fb27SDimitry Andric auto step = bounds->step; 103306c3fb27SDimitry Andric 103406c3fb27SDimitry Andric // In case of >= it's negative: 103506c3fb27SDimitry Andric auto accountForStep = 103606c3fb27SDimitry Andric ((bounds->lb0 + bounds->lb1 * outer_iv) - 103706c3fb27SDimitry Andric (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) % 103806c3fb27SDimitry Andric step; 103906c3fb27SDimitry Andric 104006c3fb27SDimitry Andric temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv + 104106c3fb27SDimitry Andric accountForStep + iteration * step; 104206c3fb27SDimitry Andric 104306c3fb27SDimitry Andric if (((bounds->comparison == comparison_t::comp_less_or_eq) && 104406c3fb27SDimitry Andric (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) || 104506c3fb27SDimitry Andric ((bounds->comparison == comparison_t::comp_greater_or_eq) && 104606c3fb27SDimitry Andric (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) { 104706c3fb27SDimitry Andric // Too small (or too big), didn't reach the original lower bound. Use 104806c3fb27SDimitry Andric // heuristic: 104906c3fb27SDimitry Andric temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step; 105006c3fb27SDimitry Andric } 105106c3fb27SDimitry Andric 105206c3fb27SDimitry Andric if (compare_with_start) { 105306c3fb27SDimitry Andric 105406c3fb27SDimitry Andric T start = static_cast<T>(original_ivs_start[ind]); 105506c3fb27SDimitry Andric 105606c3fb27SDimitry Andric temp = kmp_fix_iv(bounds->loop_iv_type, temp); 105706c3fb27SDimitry Andric 105806c3fb27SDimitry Andric // On all previous levels start of the chunk is same as the end, need to 105906c3fb27SDimitry Andric // be really careful here: 106006c3fb27SDimitry Andric if (((bounds->comparison == comparison_t::comp_less_or_eq) && 106106c3fb27SDimitry Andric (temp < start)) || 106206c3fb27SDimitry Andric ((bounds->comparison == comparison_t::comp_greater_or_eq) && 106306c3fb27SDimitry Andric (temp > start))) { 106406c3fb27SDimitry Andric // End of the chunk can't be smaller (for >= bigger) than it's start. 106506c3fb27SDimitry Andric // Use heuristic: 106606c3fb27SDimitry Andric temp = start + iteration / 4 * step; 106706c3fb27SDimitry Andric } 106806c3fb27SDimitry Andric } 106906c3fb27SDimitry Andric } 107006c3fb27SDimitry Andric 107106c3fb27SDimitry Andric original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp); 107206c3fb27SDimitry Andric 107306c3fb27SDimitry Andric if (((bounds->comparison == comparison_t::comp_less_or_eq) && 107406c3fb27SDimitry Andric (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) || 107506c3fb27SDimitry Andric ((bounds->comparison == comparison_t::comp_greater_or_eq) && 107606c3fb27SDimitry Andric (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) { 107706c3fb27SDimitry Andric // Too big (or too small for >=). 107806c3fb27SDimitry Andric return false; 107906c3fb27SDimitry Andric } 108006c3fb27SDimitry Andric 108106c3fb27SDimitry Andric return true; 108206c3fb27SDimitry Andric } 108306c3fb27SDimitry Andric 108406c3fb27SDimitry Andric // For one level calculate old induction variable corresponding to overall 108506c3fb27SDimitry Andric // new_iv for the chunk end. 108606c3fb27SDimitry Andric bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds, 108706c3fb27SDimitry Andric const bounds_info_t *updated_bounds, 108806c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, 108906c3fb27SDimitry Andric const kmp_iterations_t iterations, 109006c3fb27SDimitry Andric kmp_index_t ind, bool start_with_lower_bound, 109106c3fb27SDimitry Andric bool compare_with_start, 109206c3fb27SDimitry Andric const kmp_point_t original_ivs_start) { 109306c3fb27SDimitry Andric 109406c3fb27SDimitry Andric switch (bounds->loop_type) { 109506c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 109606c3fb27SDimitry Andric return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>( 109706c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int32> *)(bounds), 109806c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int32> *)(updated_bounds), 109906c3fb27SDimitry Andric /*in/out*/ 110006c3fb27SDimitry Andric original_ivs, iterations, ind, start_with_lower_bound, 110106c3fb27SDimitry Andric compare_with_start, original_ivs_start); 110206c3fb27SDimitry Andric break; 110306c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 110406c3fb27SDimitry Andric return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>( 110506c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint32> *)(bounds), 110606c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint32> *)(updated_bounds), 110706c3fb27SDimitry Andric /*in/out*/ 110806c3fb27SDimitry Andric original_ivs, iterations, ind, start_with_lower_bound, 110906c3fb27SDimitry Andric compare_with_start, original_ivs_start); 111006c3fb27SDimitry Andric break; 111106c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 111206c3fb27SDimitry Andric return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>( 111306c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int64> *)(bounds), 111406c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int64> *)(updated_bounds), 111506c3fb27SDimitry Andric /*in/out*/ 111606c3fb27SDimitry Andric original_ivs, iterations, ind, start_with_lower_bound, 111706c3fb27SDimitry Andric compare_with_start, original_ivs_start); 111806c3fb27SDimitry Andric break; 111906c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 112006c3fb27SDimitry Andric return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>( 112106c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint64> *)(bounds), 112206c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint64> *)(updated_bounds), 112306c3fb27SDimitry Andric /*in/out*/ 112406c3fb27SDimitry Andric original_ivs, iterations, ind, start_with_lower_bound, 112506c3fb27SDimitry Andric compare_with_start, original_ivs_start); 112606c3fb27SDimitry Andric break; 112706c3fb27SDimitry Andric default: 112806c3fb27SDimitry Andric KMP_ASSERT(false); 112906c3fb27SDimitry Andric return false; 113006c3fb27SDimitry Andric } 113106c3fb27SDimitry Andric } 113206c3fb27SDimitry Andric 113306c3fb27SDimitry Andric // Calculate old induction variables corresponding to overall new_iv for the 113406c3fb27SDimitry Andric // chunk end. If due to space extension we are getting old IVs outside of the 113506c3fb27SDimitry Andric // boundaries, bring them into the boundaries. Need to do this in the runtime, 113606c3fb27SDimitry Andric // esp. on the lower bounds side. When getting result need to make sure that the 113706c3fb27SDimitry Andric // new chunk starts at next position to old chunk, not overlaps with it (this is 113806c3fb27SDimitry Andric // done elsewhere), and need to make sure end of the chunk is further than the 113906c3fb27SDimitry Andric // beginning of the chunk. We don't need an exact ending point here, just 114006c3fb27SDimitry Andric // something more-or-less close to the desired chunk length, bigger is fine 114106c3fb27SDimitry Andric // (smaller would be fine, but we risk going into infinite loop, so do smaller 114206c3fb27SDimitry Andric // only at the very end of the space). result: false if could not find the 114306c3fb27SDimitry Andric // ending point in the original loop space. In this case the caller can use 114406c3fb27SDimitry Andric // original upper bounds as the end of the chunk. Chunk won't be empty, because 114506c3fb27SDimitry Andric // it'll have at least the starting point, which is by construction in the 114606c3fb27SDimitry Andric // original space. 114706c3fb27SDimitry Andric bool kmp_calc_original_ivs_for_chunk_end( 114806c3fb27SDimitry Andric const bounds_info_t *original_bounds_nest, kmp_index_t n, 114906c3fb27SDimitry Andric const bounds_info_internal_t *updated_bounds_nest, 115006c3fb27SDimitry Andric const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv, 115106c3fb27SDimitry Andric /*out*/ kmp_point_t original_ivs) { 115206c3fb27SDimitry Andric 115306c3fb27SDimitry Andric // Iterations in the expanded space: 11545f757f3fSDimitry Andric CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); 115506c3fb27SDimitry Andric // First, calc corresponding iteration in every modified loop: 115606c3fb27SDimitry Andric for (kmp_index_t ind = n; ind > 0;) { 115706c3fb27SDimitry Andric --ind; 115806c3fb27SDimitry Andric auto &updated_bounds = updated_bounds_nest[ind]; 115906c3fb27SDimitry Andric 116006c3fb27SDimitry Andric // should be optimized to OPDIVREM: 116106c3fb27SDimitry Andric auto new_ind = new_iv / updated_bounds.b.trip_count; 116206c3fb27SDimitry Andric auto iteration = new_iv % updated_bounds.b.trip_count; 116306c3fb27SDimitry Andric 116406c3fb27SDimitry Andric new_iv = new_ind; 116506c3fb27SDimitry Andric iterations[ind] = iteration; 116606c3fb27SDimitry Andric } 116706c3fb27SDimitry Andric KMP_DEBUG_ASSERT(new_iv == 0); 116806c3fb27SDimitry Andric 116906c3fb27SDimitry Andric kmp_index_t lengthened_ind = n; 117006c3fb27SDimitry Andric kmp_index_t equal_ind = -1; 117106c3fb27SDimitry Andric 117206c3fb27SDimitry Andric // Next calculate the point, but in original loop nest. 117306c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n;) { 117406c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 117506c3fb27SDimitry Andric auto updated_bounds = &(updated_bounds_nest[ind].b); 117606c3fb27SDimitry Andric 117706c3fb27SDimitry Andric bool good = kmp_calc_one_iv_for_chunk_end( 117806c3fb27SDimitry Andric bounds, updated_bounds, 117906c3fb27SDimitry Andric /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind), 118006c3fb27SDimitry Andric (equal_ind >= ind - 1), original_ivs_start); 118106c3fb27SDimitry Andric 118206c3fb27SDimitry Andric if (!good) { 118306c3fb27SDimitry Andric // Too big (or too small for >=). 118406c3fb27SDimitry Andric if (ind == 0) { 118506c3fb27SDimitry Andric // Need to reduce to the end. 118606c3fb27SDimitry Andric return false; 118706c3fb27SDimitry Andric } else { 118806c3fb27SDimitry Andric // Go to next iteration on outer loop: 118906c3fb27SDimitry Andric --ind; 119006c3fb27SDimitry Andric ++(iterations[ind]); 119106c3fb27SDimitry Andric lengthened_ind = ind; 119206c3fb27SDimitry Andric if (equal_ind >= lengthened_ind) { 119306c3fb27SDimitry Andric // We've changed the number of iterations here, 119406c3fb27SDimitry Andric // can't be same anymore: 119506c3fb27SDimitry Andric equal_ind = lengthened_ind - 1; 119606c3fb27SDimitry Andric } 119706c3fb27SDimitry Andric for (kmp_index_t i = ind + 1; i < n; ++i) { 119806c3fb27SDimitry Andric iterations[i] = 0; 119906c3fb27SDimitry Andric } 120006c3fb27SDimitry Andric continue; 120106c3fb27SDimitry Andric } 120206c3fb27SDimitry Andric } 120306c3fb27SDimitry Andric 120406c3fb27SDimitry Andric if ((equal_ind == ind - 1) && 120506c3fb27SDimitry Andric (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind], 120606c3fb27SDimitry Andric original_ivs_start[ind]))) { 120706c3fb27SDimitry Andric equal_ind = ind; 120806c3fb27SDimitry Andric } else if ((equal_ind > ind - 1) && 120906c3fb27SDimitry Andric !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind], 121006c3fb27SDimitry Andric original_ivs_start[ind]))) { 121106c3fb27SDimitry Andric equal_ind = ind - 1; 121206c3fb27SDimitry Andric } 121306c3fb27SDimitry Andric ++ind; 121406c3fb27SDimitry Andric } 121506c3fb27SDimitry Andric 121606c3fb27SDimitry Andric return true; 121706c3fb27SDimitry Andric } 121806c3fb27SDimitry Andric 121906c3fb27SDimitry Andric //----------Calculate upper bounds for the last chunk------------------------- 122006c3fb27SDimitry Andric 122106c3fb27SDimitry Andric // Calculate one upper bound for the end. 122206c3fb27SDimitry Andric template <typename T> 122306c3fb27SDimitry Andric void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds, 122406c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, 122506c3fb27SDimitry Andric kmp_index_t ind) { 122606c3fb27SDimitry Andric 122706c3fb27SDimitry Andric T temp = bounds->ub0 + 122806c3fb27SDimitry Andric bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]); 122906c3fb27SDimitry Andric 123006c3fb27SDimitry Andric original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp); 123106c3fb27SDimitry Andric } 123206c3fb27SDimitry Andric 123306c3fb27SDimitry Andric void kmp_calc_one_iv_end(const bounds_info_t *bounds, 123406c3fb27SDimitry Andric /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) { 123506c3fb27SDimitry Andric 123606c3fb27SDimitry Andric switch (bounds->loop_type) { 123706c3fb27SDimitry Andric default: 123806c3fb27SDimitry Andric KMP_ASSERT(false); 123906c3fb27SDimitry Andric break; 124006c3fb27SDimitry Andric case loop_type_t::loop_type_int32: 124106c3fb27SDimitry Andric kmp_calc_one_iv_end_XX<kmp_int32>( 124206c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int32> *)(bounds), 124306c3fb27SDimitry Andric /*in/out*/ original_ivs, ind); 124406c3fb27SDimitry Andric break; 124506c3fb27SDimitry Andric case loop_type_t::loop_type_uint32: 124606c3fb27SDimitry Andric kmp_calc_one_iv_end_XX<kmp_uint32>( 124706c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint32> *)(bounds), 124806c3fb27SDimitry Andric /*in/out*/ original_ivs, ind); 124906c3fb27SDimitry Andric break; 125006c3fb27SDimitry Andric case loop_type_t::loop_type_int64: 125106c3fb27SDimitry Andric kmp_calc_one_iv_end_XX<kmp_int64>( 125206c3fb27SDimitry Andric (bounds_infoXX_template<kmp_int64> *)(bounds), 125306c3fb27SDimitry Andric /*in/out*/ original_ivs, ind); 125406c3fb27SDimitry Andric break; 125506c3fb27SDimitry Andric case loop_type_t::loop_type_uint64: 125606c3fb27SDimitry Andric kmp_calc_one_iv_end_XX<kmp_uint64>( 125706c3fb27SDimitry Andric (bounds_infoXX_template<kmp_uint64> *)(bounds), 125806c3fb27SDimitry Andric /*in/out*/ original_ivs, ind); 125906c3fb27SDimitry Andric break; 126006c3fb27SDimitry Andric } 126106c3fb27SDimitry Andric } 126206c3fb27SDimitry Andric 126306c3fb27SDimitry Andric // Calculate upper bounds for the last loop iteration. Just use original upper 126406c3fb27SDimitry Andric // bounds (adjusted when canonicalized to use <= / >=). No need to check that 126506c3fb27SDimitry Andric // this point is in the original space (it's likely not) 126606c3fb27SDimitry Andric void kmp_calc_original_ivs_for_end( 126706c3fb27SDimitry Andric const bounds_info_t *const original_bounds_nest, kmp_index_t n, 126806c3fb27SDimitry Andric /*out*/ kmp_point_t original_ivs) { 126906c3fb27SDimitry Andric for (kmp_index_t ind = 0; ind < n; ++ind) { 127006c3fb27SDimitry Andric auto bounds = &(original_bounds_nest[ind]); 127106c3fb27SDimitry Andric kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind); 127206c3fb27SDimitry Andric } 127306c3fb27SDimitry Andric } 127406c3fb27SDimitry Andric 1275*0fca6ea1SDimitry Andric /************************************************************************** 1276*0fca6ea1SDimitry Andric * Identify nested loop structure - loops come in the canonical form 1277*0fca6ea1SDimitry Andric * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0} 1278*0fca6ea1SDimitry Andric * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1} 1279*0fca6ea1SDimitry Andric * Upper Triangle matrix 1280*0fca6ea1SDimitry Andric * i = 0; i <= N; i++ {0,0}:{N,0} 1281*0fca6ea1SDimitry Andric * j = 0+1*i; j <= N; j++ {0,1}:{N,0} 1282*0fca6ea1SDimitry Andric * ************************************************************************/ 1283*0fca6ea1SDimitry Andric nested_loop_type_t 1284*0fca6ea1SDimitry Andric kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest, 1285*0fca6ea1SDimitry Andric /*in*/ kmp_index_t n) { 1286*0fca6ea1SDimitry Andric // only 2-level nested loops are supported 1287*0fca6ea1SDimitry Andric if (n != 2) { 1288*0fca6ea1SDimitry Andric return nested_loop_type_unkown; 1289*0fca6ea1SDimitry Andric } 1290*0fca6ea1SDimitry Andric // loops must be canonical 1291*0fca6ea1SDimitry Andric KMP_ASSERT( 1292*0fca6ea1SDimitry Andric (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) && 1293*0fca6ea1SDimitry Andric (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq)); 1294*0fca6ea1SDimitry Andric // check outer loop bounds: for triangular need to be {0,0}:{N,0} 1295*0fca6ea1SDimitry Andric kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1296*0fca6ea1SDimitry Andric original_bounds_nest[0].lb0_u64); 1297*0fca6ea1SDimitry Andric kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1298*0fca6ea1SDimitry Andric original_bounds_nest[0].ub0_u64); 1299*0fca6ea1SDimitry Andric kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1300*0fca6ea1SDimitry Andric original_bounds_nest[0].lb1_u64); 1301*0fca6ea1SDimitry Andric kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1302*0fca6ea1SDimitry Andric original_bounds_nest[0].ub1_u64); 1303*0fca6ea1SDimitry Andric if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) { 1304*0fca6ea1SDimitry Andric return nested_loop_type_unkown; 1305*0fca6ea1SDimitry Andric } 1306*0fca6ea1SDimitry Andric // check inner bounds to determine triangle type 1307*0fca6ea1SDimitry Andric kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, 1308*0fca6ea1SDimitry Andric original_bounds_nest[1].lb0_u64); 1309*0fca6ea1SDimitry Andric kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, 1310*0fca6ea1SDimitry Andric original_bounds_nest[1].ub0_u64); 1311*0fca6ea1SDimitry Andric kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, 1312*0fca6ea1SDimitry Andric original_bounds_nest[1].lb1_u64); 1313*0fca6ea1SDimitry Andric kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, 1314*0fca6ea1SDimitry Andric original_bounds_nest[1].ub1_u64); 1315*0fca6ea1SDimitry Andric // lower triangle loop inner bounds need to be {0,0}:{0/-1,1} 1316*0fca6ea1SDimitry Andric if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 && 1317*0fca6ea1SDimitry Andric (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) { 1318*0fca6ea1SDimitry Andric return nested_loop_type_lower_triangular_matrix; 1319*0fca6ea1SDimitry Andric } 1320*0fca6ea1SDimitry Andric // upper triangle loop inner bounds need to be {0,1}:{N,0} 1321*0fca6ea1SDimitry Andric if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 && 1322*0fca6ea1SDimitry Andric inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) { 1323*0fca6ea1SDimitry Andric return nested_loop_type_upper_triangular_matrix; 1324*0fca6ea1SDimitry Andric } 1325*0fca6ea1SDimitry Andric return nested_loop_type_unkown; 1326*0fca6ea1SDimitry Andric } 1327*0fca6ea1SDimitry Andric 1328*0fca6ea1SDimitry Andric /************************************************************************** 1329*0fca6ea1SDimitry Andric * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf 1330*0fca6ea1SDimitry Andric * Start point is x so the result is always > sqrt(x) 1331*0fca6ea1SDimitry Andric * The method has uniform convergence, PRECISION is set to 0.1 1332*0fca6ea1SDimitry Andric * ************************************************************************/ 1333*0fca6ea1SDimitry Andric #define level_of_precision 0.1 1334*0fca6ea1SDimitry Andric double sqrt_newton_approx(/*in*/ kmp_uint64 x) { 1335*0fca6ea1SDimitry Andric double sqrt_old = 0.; 1336*0fca6ea1SDimitry Andric double sqrt_new = (double)x; 1337*0fca6ea1SDimitry Andric do { 1338*0fca6ea1SDimitry Andric sqrt_old = sqrt_new; 1339*0fca6ea1SDimitry Andric sqrt_new = (sqrt_old + x / sqrt_old) / 2; 1340*0fca6ea1SDimitry Andric } while ((sqrt_old - sqrt_new) > level_of_precision); 1341*0fca6ea1SDimitry Andric return sqrt_new; 1342*0fca6ea1SDimitry Andric } 1343*0fca6ea1SDimitry Andric 1344*0fca6ea1SDimitry Andric /************************************************************************** 1345*0fca6ea1SDimitry Andric * Handle lower triangle matrix in the canonical form 1346*0fca6ea1SDimitry Andric * i = 0; i <= N; i++ {0,0}:{N,0} 1347*0fca6ea1SDimitry Andric * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1} 1348*0fca6ea1SDimitry Andric * ************************************************************************/ 1349*0fca6ea1SDimitry Andric void kmp_handle_lower_triangle_matrix( 1350*0fca6ea1SDimitry Andric /*in*/ kmp_uint32 nth, 1351*0fca6ea1SDimitry Andric /*in*/ kmp_uint32 tid, 1352*0fca6ea1SDimitry Andric /*in */ kmp_index_t n, 1353*0fca6ea1SDimitry Andric /*in/out*/ bounds_info_t *original_bounds_nest, 1354*0fca6ea1SDimitry Andric /*out*/ bounds_info_t *chunk_bounds_nest) { 1355*0fca6ea1SDimitry Andric 1356*0fca6ea1SDimitry Andric // transfer loop types from the original loop to the chunks 1357*0fca6ea1SDimitry Andric for (kmp_index_t i = 0; i < n; ++i) { 1358*0fca6ea1SDimitry Andric chunk_bounds_nest[i] = original_bounds_nest[i]; 1359*0fca6ea1SDimitry Andric } 1360*0fca6ea1SDimitry Andric // cleanup iv variables 1361*0fca6ea1SDimitry Andric kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1362*0fca6ea1SDimitry Andric original_bounds_nest[0].ub0_u64); 1363*0fca6ea1SDimitry Andric kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1364*0fca6ea1SDimitry Andric original_bounds_nest[0].lb0_u64); 1365*0fca6ea1SDimitry Andric kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, 1366*0fca6ea1SDimitry Andric original_bounds_nest[1].ub0_u64); 1367*0fca6ea1SDimitry Andric // calculate the chunk's lower and upper bounds 1368*0fca6ea1SDimitry Andric // the total number of iterations in the loop is the sum of the arithmetic 1369*0fca6ea1SDimitry Andric // progression from the outer lower to outer upper bound (inclusive since the 1370*0fca6ea1SDimitry Andric // loop is canonical) note that less_than inner loops (inner_ub0 = -1) 1371*0fca6ea1SDimitry Andric // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0 1372*0fca6ea1SDimitry Andric // + 1) -> N - 1 1373*0fca6ea1SDimitry Andric kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0; 1374*0fca6ea1SDimitry Andric kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; 1375*0fca6ea1SDimitry Andric // the current thread's number of iterations: 1376*0fca6ea1SDimitry Andric // each thread gets an equal number of iterations: total number of iterations 1377*0fca6ea1SDimitry Andric // divided by the number of threads plus, if there's a remainder, 1378*0fca6ea1SDimitry Andric // the first threads with the number up to the remainder get an additional 1379*0fca6ea1SDimitry Andric // iteration each to cover it 1380*0fca6ea1SDimitry Andric kmp_uint64 iter_current = 1381*0fca6ea1SDimitry Andric iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0); 1382*0fca6ea1SDimitry Andric // cumulative number of iterations executed by all the previous threads: 1383*0fca6ea1SDimitry Andric // threads with the tid below the remainder will have (iter_total/nth+1) 1384*0fca6ea1SDimitry Andric // elements, and so will all threads before them so the cumulative number of 1385*0fca6ea1SDimitry Andric // iterations executed by the all previous will be the current thread's number 1386*0fca6ea1SDimitry Andric // of iterations multiplied by the number of previous threads which is equal 1387*0fca6ea1SDimitry Andric // to the current thread's tid; threads with the number equal or above the 1388*0fca6ea1SDimitry Andric // remainder will have (iter_total/nth) elements so the cumulative number of 1389*0fca6ea1SDimitry Andric // iterations previously executed is its number of iterations multipled by the 1390*0fca6ea1SDimitry Andric // number of previous threads which is again equal to the current thread's tid 1391*0fca6ea1SDimitry Andric // PLUS all the remainder iterations that will have been executed by the 1392*0fca6ea1SDimitry Andric // previous threads 1393*0fca6ea1SDimitry Andric kmp_uint64 iter_before_current = 1394*0fca6ea1SDimitry Andric tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth)); 1395*0fca6ea1SDimitry Andric // cumulative number of iterations executed with the current thread is 1396*0fca6ea1SDimitry Andric // the cumulative number executed before it plus its own 1397*0fca6ea1SDimitry Andric kmp_uint64 iter_with_current = iter_before_current + iter_current; 1398*0fca6ea1SDimitry Andric // calculate the outer loop lower bound (lbo) which is the max outer iv value 1399*0fca6ea1SDimitry Andric // that gives the number of iterations that is equal or just below the total 1400*0fca6ea1SDimitry Andric // number of iterations executed by the previous threads, for less_than 1401*0fca6ea1SDimitry Andric // (1-based) inner loops (inner_ub0 == -1) it will be i.e. 1402*0fca6ea1SDimitry Andric // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0 1403*0fca6ea1SDimitry Andric // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be: 1404*0fca6ea1SDimitry Andric // i.e. lbo*(lbo+1)/2<=iter_before_current => 1405*0fca6ea1SDimitry Andric // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily 1406*0fca6ea1SDimitry Andric // using a parameter to control the equation sign 1407*0fca6ea1SDimitry Andric kmp_int64 inner_adjustment = 1 + 2 * inner_ub0; 1408*0fca6ea1SDimitry Andric kmp_uint64 lower_bound_outer = 1409*0fca6ea1SDimitry Andric (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment + 1410*0fca6ea1SDimitry Andric 8 * iter_before_current) + 1411*0fca6ea1SDimitry Andric inner_adjustment) / 1412*0fca6ea1SDimitry Andric 2 - 1413*0fca6ea1SDimitry Andric inner_adjustment; 1414*0fca6ea1SDimitry Andric // calculate the inner loop lower bound which is the remaining number of 1415*0fca6ea1SDimitry Andric // iterations required to hit the total number of iterations executed by the 1416*0fca6ea1SDimitry Andric // previous threads giving the starting point of this thread 1417*0fca6ea1SDimitry Andric kmp_uint64 lower_bound_inner = 1418*0fca6ea1SDimitry Andric iter_before_current - 1419*0fca6ea1SDimitry Andric ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2; 1420*0fca6ea1SDimitry Andric // calculate the outer loop upper bound using the same approach as for the 1421*0fca6ea1SDimitry Andric // inner bound except using the total number of iterations executed with the 1422*0fca6ea1SDimitry Andric // current thread 1423*0fca6ea1SDimitry Andric kmp_uint64 upper_bound_outer = 1424*0fca6ea1SDimitry Andric (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment + 1425*0fca6ea1SDimitry Andric 8 * iter_with_current) + 1426*0fca6ea1SDimitry Andric inner_adjustment) / 1427*0fca6ea1SDimitry Andric 2 - 1428*0fca6ea1SDimitry Andric inner_adjustment; 1429*0fca6ea1SDimitry Andric // calculate the inner loop upper bound which is the remaining number of 1430*0fca6ea1SDimitry Andric // iterations required to hit the total number of iterations executed after 1431*0fca6ea1SDimitry Andric // the current thread giving the starting point of the next thread 1432*0fca6ea1SDimitry Andric kmp_uint64 upper_bound_inner = 1433*0fca6ea1SDimitry Andric iter_with_current - 1434*0fca6ea1SDimitry Andric ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2; 1435*0fca6ea1SDimitry Andric // adjust the upper bounds down by 1 element to point at the last iteration of 1436*0fca6ea1SDimitry Andric // the current thread the first iteration of the next thread 1437*0fca6ea1SDimitry Andric if (upper_bound_inner == 0) { 1438*0fca6ea1SDimitry Andric // {n,0} => {n-1,n-1} 1439*0fca6ea1SDimitry Andric upper_bound_outer -= 1; 1440*0fca6ea1SDimitry Andric upper_bound_inner = upper_bound_outer; 1441*0fca6ea1SDimitry Andric } else { 1442*0fca6ea1SDimitry Andric // {n,m} => {n,m-1} (m!=0) 1443*0fca6ea1SDimitry Andric upper_bound_inner -= 1; 1444*0fca6ea1SDimitry Andric } 1445*0fca6ea1SDimitry Andric 1446*0fca6ea1SDimitry Andric // assign the values, zeroing out lb1 and ub1 values since the iteration space 1447*0fca6ea1SDimitry Andric // is now one-dimensional 1448*0fca6ea1SDimitry Andric chunk_bounds_nest[0].lb0_u64 = lower_bound_outer; 1449*0fca6ea1SDimitry Andric chunk_bounds_nest[1].lb0_u64 = lower_bound_inner; 1450*0fca6ea1SDimitry Andric chunk_bounds_nest[0].ub0_u64 = upper_bound_outer; 1451*0fca6ea1SDimitry Andric chunk_bounds_nest[1].ub0_u64 = upper_bound_inner; 1452*0fca6ea1SDimitry Andric chunk_bounds_nest[0].lb1_u64 = 0; 1453*0fca6ea1SDimitry Andric chunk_bounds_nest[0].ub1_u64 = 0; 1454*0fca6ea1SDimitry Andric chunk_bounds_nest[1].lb1_u64 = 0; 1455*0fca6ea1SDimitry Andric chunk_bounds_nest[1].ub1_u64 = 0; 1456*0fca6ea1SDimitry Andric 1457*0fca6ea1SDimitry Andric #if 0 1458*0fca6ea1SDimitry Andric printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n", 1459*0fca6ea1SDimitry Andric tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64, 1460*0fca6ea1SDimitry Andric chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total); 1461*0fca6ea1SDimitry Andric #endif 1462*0fca6ea1SDimitry Andric } 1463*0fca6ea1SDimitry Andric 1464*0fca6ea1SDimitry Andric /************************************************************************** 1465*0fca6ea1SDimitry Andric * Handle upper triangle matrix in the canonical form 1466*0fca6ea1SDimitry Andric * i = 0; i <= N; i++ {0,0}:{N,0} 1467*0fca6ea1SDimitry Andric * j = 0+1*i; j <= N; j++ {0,1}:{N,0} 1468*0fca6ea1SDimitry Andric * ************************************************************************/ 1469*0fca6ea1SDimitry Andric void kmp_handle_upper_triangle_matrix( 1470*0fca6ea1SDimitry Andric /*in*/ kmp_uint32 nth, 1471*0fca6ea1SDimitry Andric /*in*/ kmp_uint32 tid, 1472*0fca6ea1SDimitry Andric /*in */ kmp_index_t n, 1473*0fca6ea1SDimitry Andric /*in/out*/ bounds_info_t *original_bounds_nest, 1474*0fca6ea1SDimitry Andric /*out*/ bounds_info_t *chunk_bounds_nest) { 1475*0fca6ea1SDimitry Andric 1476*0fca6ea1SDimitry Andric // transfer loop types from the original loop to the chunks 1477*0fca6ea1SDimitry Andric for (kmp_index_t i = 0; i < n; ++i) { 1478*0fca6ea1SDimitry Andric chunk_bounds_nest[i] = original_bounds_nest[i]; 1479*0fca6ea1SDimitry Andric } 1480*0fca6ea1SDimitry Andric // cleanup iv variables 1481*0fca6ea1SDimitry Andric kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1482*0fca6ea1SDimitry Andric original_bounds_nest[0].ub0_u64); 1483*0fca6ea1SDimitry Andric kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, 1484*0fca6ea1SDimitry Andric original_bounds_nest[0].lb0_u64); 1485*0fca6ea1SDimitry Andric [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv( 1486*0fca6ea1SDimitry Andric original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64); 1487*0fca6ea1SDimitry Andric // calculate the chunk's lower and upper bounds 1488*0fca6ea1SDimitry Andric // the total number of iterations in the loop is the sum of the arithmetic 1489*0fca6ea1SDimitry Andric // progression from the outer lower to outer upper bound (inclusive since the 1490*0fca6ea1SDimitry Andric // loop is canonical) note that less_than inner loops (inner_ub0 = -1) 1491*0fca6ea1SDimitry Andric // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0 1492*0fca6ea1SDimitry Andric // + 1) -> N - 1 1493*0fca6ea1SDimitry Andric kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1); 1494*0fca6ea1SDimitry Andric kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; 1495*0fca6ea1SDimitry Andric // the current thread's number of iterations: 1496*0fca6ea1SDimitry Andric // each thread gets an equal number of iterations: total number of iterations 1497*0fca6ea1SDimitry Andric // divided by the number of threads plus, if there's a remainder, 1498*0fca6ea1SDimitry Andric // the first threads with the number up to the remainder get an additional 1499*0fca6ea1SDimitry Andric // iteration each to cover it 1500*0fca6ea1SDimitry Andric kmp_uint64 iter_current = 1501*0fca6ea1SDimitry Andric iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0); 1502*0fca6ea1SDimitry Andric // cumulative number of iterations executed by all the previous threads: 1503*0fca6ea1SDimitry Andric // threads with the tid below the remainder will have (iter_total/nth+1) 1504*0fca6ea1SDimitry Andric // elements, and so will all threads before them so the cumulative number of 1505*0fca6ea1SDimitry Andric // iterations executed by the all previous will be the current thread's number 1506*0fca6ea1SDimitry Andric // of iterations multiplied by the number of previous threads which is equal 1507*0fca6ea1SDimitry Andric // to the current thread's tid; threads with the number equal or above the 1508*0fca6ea1SDimitry Andric // remainder will have (iter_total/nth) elements so the cumulative number of 1509*0fca6ea1SDimitry Andric // iterations previously executed is its number of iterations multipled by the 1510*0fca6ea1SDimitry Andric // number of previous threads which is again equal to the current thread's tid 1511*0fca6ea1SDimitry Andric // PLUS all the remainder iterations that will have been executed by the 1512*0fca6ea1SDimitry Andric // previous threads 1513*0fca6ea1SDimitry Andric kmp_uint64 iter_before_current = 1514*0fca6ea1SDimitry Andric tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth)); 1515*0fca6ea1SDimitry Andric // cumulative number of iterations executed with the current thread is 1516*0fca6ea1SDimitry Andric // the cumulative number executed before it plus its own 1517*0fca6ea1SDimitry Andric kmp_uint64 iter_with_current = iter_before_current + iter_current; 1518*0fca6ea1SDimitry Andric // calculate the outer loop lower bound (lbo) which is the max outer iv value 1519*0fca6ea1SDimitry Andric // that gives the number of iterations that is equal or just below the total 1520*0fca6ea1SDimitry Andric // number of iterations executed by the previous threads: 1521*0fca6ea1SDimitry Andric // lbo*(lbo+1)/2<=iter_before_current => 1522*0fca6ea1SDimitry Andric // lbo^2+lbo-2*iter_before_current<=0 1523*0fca6ea1SDimitry Andric kmp_uint64 lower_bound_outer = 1524*0fca6ea1SDimitry Andric (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1; 1525*0fca6ea1SDimitry Andric // calculate the inner loop lower bound which is the remaining number of 1526*0fca6ea1SDimitry Andric // iterations required to hit the total number of iterations executed by the 1527*0fca6ea1SDimitry Andric // previous threads giving the starting point of this thread 1528*0fca6ea1SDimitry Andric kmp_uint64 lower_bound_inner = 1529*0fca6ea1SDimitry Andric iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2; 1530*0fca6ea1SDimitry Andric // calculate the outer loop upper bound using the same approach as for the 1531*0fca6ea1SDimitry Andric // inner bound except using the total number of iterations executed with the 1532*0fca6ea1SDimitry Andric // current thread 1533*0fca6ea1SDimitry Andric kmp_uint64 upper_bound_outer = 1534*0fca6ea1SDimitry Andric (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1; 1535*0fca6ea1SDimitry Andric // calculate the inner loop upper bound which is the remaining number of 1536*0fca6ea1SDimitry Andric // iterations required to hit the total number of iterations executed after 1537*0fca6ea1SDimitry Andric // the current thread giving the starting point of the next thread 1538*0fca6ea1SDimitry Andric kmp_uint64 upper_bound_inner = 1539*0fca6ea1SDimitry Andric iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2; 1540*0fca6ea1SDimitry Andric // adjust the upper bounds down by 1 element to point at the last iteration of 1541*0fca6ea1SDimitry Andric // the current thread the first iteration of the next thread 1542*0fca6ea1SDimitry Andric if (upper_bound_inner == 0) { 1543*0fca6ea1SDimitry Andric // {n,0} => {n-1,n-1} 1544*0fca6ea1SDimitry Andric upper_bound_outer -= 1; 1545*0fca6ea1SDimitry Andric upper_bound_inner = upper_bound_outer; 1546*0fca6ea1SDimitry Andric } else { 1547*0fca6ea1SDimitry Andric // {n,m} => {n,m-1} (m!=0) 1548*0fca6ea1SDimitry Andric upper_bound_inner -= 1; 1549*0fca6ea1SDimitry Andric } 1550*0fca6ea1SDimitry Andric 1551*0fca6ea1SDimitry Andric // assign the values, zeroing out lb1 and ub1 values since the iteration space 1552*0fca6ea1SDimitry Andric // is now one-dimensional 1553*0fca6ea1SDimitry Andric chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer; 1554*0fca6ea1SDimitry Andric chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner; 1555*0fca6ea1SDimitry Andric chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer; 1556*0fca6ea1SDimitry Andric chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner; 1557*0fca6ea1SDimitry Andric chunk_bounds_nest[0].lb1_u64 = 0; 1558*0fca6ea1SDimitry Andric chunk_bounds_nest[0].ub1_u64 = 0; 1559*0fca6ea1SDimitry Andric chunk_bounds_nest[1].lb1_u64 = 0; 1560*0fca6ea1SDimitry Andric chunk_bounds_nest[1].ub1_u64 = 0; 1561*0fca6ea1SDimitry Andric 1562*0fca6ea1SDimitry Andric #if 0 1563*0fca6ea1SDimitry Andric printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n", 1564*0fca6ea1SDimitry Andric tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64, 1565*0fca6ea1SDimitry Andric chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total); 1566*0fca6ea1SDimitry Andric #endif 1567*0fca6ea1SDimitry Andric } 156806c3fb27SDimitry Andric //----------Init API for non-rectangular loops-------------------------------- 156906c3fb27SDimitry Andric 157006c3fb27SDimitry Andric // Init API for collapsed loops (static, no chunks defined). 157106c3fb27SDimitry Andric // "bounds_nest" has to be allocated per thread. 157206c3fb27SDimitry Andric // API will modify original bounds_nest array to bring it to a canonical form 157306c3fb27SDimitry Andric // (only <= and >=, no !=, <, >). If the original loop nest was already in a 157406c3fb27SDimitry Andric // canonical form there will be no changes to bounds in bounds_nest array 157506c3fb27SDimitry Andric // (only trip counts will be calculated). Internally API will expand the space 157606c3fb27SDimitry Andric // to parallelogram/parallelepiped, calculate total, calculate bounds for the 157706c3fb27SDimitry Andric // chunks in terms of the new IV, re-calc them in terms of old IVs (especially 157806c3fb27SDimitry Andric // important on the left side, to hit the lower bounds and not step over), and 157906c3fb27SDimitry Andric // pick the correct chunk for this thread (so it will calculate chunks up to the 158006c3fb27SDimitry Andric // needed one). It could be optimized to calculate just this chunk, potentially 158106c3fb27SDimitry Andric // a bit less well distributed among threads. It is designed to make sure that 158206c3fb27SDimitry Andric // threads will receive predictable chunks, deterministically (so that next nest 158306c3fb27SDimitry Andric // of loops with similar characteristics will get exactly same chunks on same 158406c3fb27SDimitry Andric // threads). 158506c3fb27SDimitry Andric // Current contract: chunk_bounds_nest has only lb0 and ub0, 158606c3fb27SDimitry Andric // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future). 158706c3fb27SDimitry Andric extern "C" kmp_int32 158806c3fb27SDimitry Andric __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, 158906c3fb27SDimitry Andric /*in/out*/ bounds_info_t *original_bounds_nest, 159006c3fb27SDimitry Andric /*out*/ bounds_info_t *chunk_bounds_nest, 159106c3fb27SDimitry Andric kmp_index_t n, /*out*/ kmp_int32 *plastiter) { 159206c3fb27SDimitry Andric 159306c3fb27SDimitry Andric KMP_DEBUG_ASSERT(plastiter && original_bounds_nest); 159406c3fb27SDimitry Andric KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid)); 159506c3fb27SDimitry Andric 159606c3fb27SDimitry Andric if (__kmp_env_consistency_check) { 159706c3fb27SDimitry Andric __kmp_push_workshare(gtid, ct_pdo, loc); 159806c3fb27SDimitry Andric } 159906c3fb27SDimitry Andric 160006c3fb27SDimitry Andric kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); 160106c3fb27SDimitry Andric 16025f757f3fSDimitry Andric CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n); 160306c3fb27SDimitry Andric 160406c3fb27SDimitry Andric for (kmp_index_t i = 0; i < n; ++i) { 160506c3fb27SDimitry Andric updated_bounds_nest[i].b = original_bounds_nest[i]; 160606c3fb27SDimitry Andric } 160706c3fb27SDimitry Andric 160806c3fb27SDimitry Andric kmp_loop_nest_iv_t total = 160906c3fb27SDimitry Andric kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n); 161006c3fb27SDimitry Andric 161106c3fb27SDimitry Andric if (plastiter != NULL) { 161206c3fb27SDimitry Andric *plastiter = FALSE; 161306c3fb27SDimitry Andric } 161406c3fb27SDimitry Andric 161506c3fb27SDimitry Andric if (total == 0) { 161606c3fb27SDimitry Andric // Loop won't execute: 161706c3fb27SDimitry Andric return FALSE; 161806c3fb27SDimitry Andric } 161906c3fb27SDimitry Andric 162006c3fb27SDimitry Andric // OMPTODO: DISTRIBUTE is not supported yet 162106c3fb27SDimitry Andric __kmp_assert_valid_gtid(gtid); 162206c3fb27SDimitry Andric kmp_uint32 tid = __kmp_tid_from_gtid(gtid); 162306c3fb27SDimitry Andric 162406c3fb27SDimitry Andric kmp_info_t *th = __kmp_threads[gtid]; 162506c3fb27SDimitry Andric kmp_team_t *team = th->th.th_team; 162606c3fb27SDimitry Andric kmp_uint32 nth = team->t.t_nproc; // Number of threads 162706c3fb27SDimitry Andric 162806c3fb27SDimitry Andric KMP_DEBUG_ASSERT(tid < nth); 162906c3fb27SDimitry Andric 1630*0fca6ea1SDimitry Andric // Handle special cases 1631*0fca6ea1SDimitry Andric nested_loop_type_t loop_type = 1632*0fca6ea1SDimitry Andric kmp_identify_nested_loop_structure(original_bounds_nest, n); 1633*0fca6ea1SDimitry Andric if (loop_type == nested_loop_type_lower_triangular_matrix) { 1634*0fca6ea1SDimitry Andric kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest, 1635*0fca6ea1SDimitry Andric chunk_bounds_nest); 1636*0fca6ea1SDimitry Andric return TRUE; 1637*0fca6ea1SDimitry Andric } else if (loop_type == nested_loop_type_upper_triangular_matrix) { 1638*0fca6ea1SDimitry Andric kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest, 1639*0fca6ea1SDimitry Andric chunk_bounds_nest); 1640*0fca6ea1SDimitry Andric return TRUE; 1641*0fca6ea1SDimitry Andric } 1642*0fca6ea1SDimitry Andric 16435f757f3fSDimitry Andric CollapseAllocator<kmp_uint64> original_ivs_start(n); 164406c3fb27SDimitry Andric 164506c3fb27SDimitry Andric if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n, 164606c3fb27SDimitry Andric /*out*/ original_ivs_start)) { 164706c3fb27SDimitry Andric // Loop won't execute: 164806c3fb27SDimitry Andric return FALSE; 164906c3fb27SDimitry Andric } 165006c3fb27SDimitry Andric 165106c3fb27SDimitry Andric // Not doing this optimization for one thread: 165206c3fb27SDimitry Andric // (1) more to test 165306c3fb27SDimitry Andric // (2) without it current contract that chunk_bounds_nest has only lb0 and 165406c3fb27SDimitry Andric // ub0, lb1 and ub1 are set to 0 and can be ignored. 165506c3fb27SDimitry Andric // if (nth == 1) { 165606c3fb27SDimitry Andric // // One thread: 165706c3fb27SDimitry Andric // // Copy all info from original_bounds_nest, it'll be good enough. 165806c3fb27SDimitry Andric 165906c3fb27SDimitry Andric // for (kmp_index_t i = 0; i < n; ++i) { 166006c3fb27SDimitry Andric // chunk_bounds_nest[i] = original_bounds_nest[i]; 166106c3fb27SDimitry Andric // } 166206c3fb27SDimitry Andric 166306c3fb27SDimitry Andric // if (plastiter != NULL) { 166406c3fb27SDimitry Andric // *plastiter = TRUE; 166506c3fb27SDimitry Andric // } 166606c3fb27SDimitry Andric // return TRUE; 166706c3fb27SDimitry Andric //} 166806c3fb27SDimitry Andric 166906c3fb27SDimitry Andric kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs( 167006c3fb27SDimitry Andric updated_bounds_nest, original_ivs_start, n); 167106c3fb27SDimitry Andric 167206c3fb27SDimitry Andric bool last_iter = false; 167306c3fb27SDimitry Andric 167406c3fb27SDimitry Andric for (; nth > 0;) { 167506c3fb27SDimitry Andric // We could calculate chunk size once, but this is to compensate that the 167606c3fb27SDimitry Andric // original space is not parallelepiped and some threads can be left 167706c3fb27SDimitry Andric // without work: 167806c3fb27SDimitry Andric KMP_DEBUG_ASSERT(total >= new_iv); 167906c3fb27SDimitry Andric 168006c3fb27SDimitry Andric kmp_loop_nest_iv_t total_left = total - new_iv; 168106c3fb27SDimitry Andric kmp_loop_nest_iv_t chunk_size = total_left / nth; 168206c3fb27SDimitry Andric kmp_loop_nest_iv_t remainder = total_left % nth; 168306c3fb27SDimitry Andric 168406c3fb27SDimitry Andric kmp_loop_nest_iv_t curr_chunk_size = chunk_size; 168506c3fb27SDimitry Andric 168606c3fb27SDimitry Andric if (remainder > 0) { 168706c3fb27SDimitry Andric ++curr_chunk_size; 168806c3fb27SDimitry Andric --remainder; 168906c3fb27SDimitry Andric } 169006c3fb27SDimitry Andric 169106c3fb27SDimitry Andric #if defined(KMP_DEBUG) 169206c3fb27SDimitry Andric kmp_loop_nest_iv_t new_iv_for_start = new_iv; 169306c3fb27SDimitry Andric #endif 169406c3fb27SDimitry Andric 169506c3fb27SDimitry Andric if (curr_chunk_size > 1) { 169606c3fb27SDimitry Andric new_iv += curr_chunk_size - 1; 169706c3fb27SDimitry Andric } 169806c3fb27SDimitry Andric 16995f757f3fSDimitry Andric CollapseAllocator<kmp_uint64> original_ivs_end(n); 170006c3fb27SDimitry Andric if ((nth == 1) || (new_iv >= total - 1)) { 170106c3fb27SDimitry Andric // Do this one till the end - just in case we miscalculated 170206c3fb27SDimitry Andric // and either too much is left to process or new_iv is a bit too big: 170306c3fb27SDimitry Andric kmp_calc_original_ivs_for_end(original_bounds_nest, n, 170406c3fb27SDimitry Andric /*out*/ original_ivs_end); 170506c3fb27SDimitry Andric 170606c3fb27SDimitry Andric last_iter = true; 170706c3fb27SDimitry Andric } else { 170806c3fb27SDimitry Andric // Note: here we make sure it's past (or equal to) the previous point. 170906c3fb27SDimitry Andric if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n, 171006c3fb27SDimitry Andric updated_bounds_nest, 171106c3fb27SDimitry Andric original_ivs_start, new_iv, 171206c3fb27SDimitry Andric /*out*/ original_ivs_end)) { 171306c3fb27SDimitry Andric // We could not find the ending point, use the original upper bounds: 171406c3fb27SDimitry Andric kmp_calc_original_ivs_for_end(original_bounds_nest, n, 171506c3fb27SDimitry Andric /*out*/ original_ivs_end); 171606c3fb27SDimitry Andric 171706c3fb27SDimitry Andric last_iter = true; 171806c3fb27SDimitry Andric } 171906c3fb27SDimitry Andric } 172006c3fb27SDimitry Andric 172106c3fb27SDimitry Andric #if defined(KMP_DEBUG) 172206c3fb27SDimitry Andric auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs( 172306c3fb27SDimitry Andric updated_bounds_nest, original_ivs_end, n); 172406c3fb27SDimitry Andric KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start); 172506c3fb27SDimitry Andric #endif 172606c3fb27SDimitry Andric 172706c3fb27SDimitry Andric if (last_iter && (tid != 0)) { 172806c3fb27SDimitry Andric // We are done, this was last chunk, but no chunk for current thread was 172906c3fb27SDimitry Andric // found: 173006c3fb27SDimitry Andric return FALSE; 173106c3fb27SDimitry Andric } 173206c3fb27SDimitry Andric 173306c3fb27SDimitry Andric if (tid == 0) { 173406c3fb27SDimitry Andric // We found the chunk for this thread, now we need to check if it's the 173506c3fb27SDimitry Andric // last chunk or not: 173606c3fb27SDimitry Andric 17375f757f3fSDimitry Andric CollapseAllocator<kmp_uint64> original_ivs_next_start(n); 173806c3fb27SDimitry Andric if (last_iter || 173906c3fb27SDimitry Andric !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end, 174006c3fb27SDimitry Andric /*out*/ original_ivs_next_start)) { 174106c3fb27SDimitry Andric // no more loop iterations left to process, 174206c3fb27SDimitry Andric // this means that currently found chunk is the last chunk: 174306c3fb27SDimitry Andric if (plastiter != NULL) { 174406c3fb27SDimitry Andric *plastiter = TRUE; 174506c3fb27SDimitry Andric } 174606c3fb27SDimitry Andric } 174706c3fb27SDimitry Andric 174806c3fb27SDimitry Andric // Fill in chunk bounds: 174906c3fb27SDimitry Andric for (kmp_index_t i = 0; i < n; ++i) { 175006c3fb27SDimitry Andric chunk_bounds_nest[i] = 175106c3fb27SDimitry Andric original_bounds_nest[i]; // To fill in types, etc. - optional 175206c3fb27SDimitry Andric chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i]; 175306c3fb27SDimitry Andric chunk_bounds_nest[i].lb1_u64 = 0; 175406c3fb27SDimitry Andric 175506c3fb27SDimitry Andric chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i]; 175606c3fb27SDimitry Andric chunk_bounds_nest[i].ub1_u64 = 0; 175706c3fb27SDimitry Andric } 175806c3fb27SDimitry Andric 175906c3fb27SDimitry Andric return TRUE; 176006c3fb27SDimitry Andric } 176106c3fb27SDimitry Andric 176206c3fb27SDimitry Andric --tid; 176306c3fb27SDimitry Andric --nth; 176406c3fb27SDimitry Andric 176506c3fb27SDimitry Andric bool next_chunk = kmp_calc_next_original_ivs( 176606c3fb27SDimitry Andric original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start); 176706c3fb27SDimitry Andric if (!next_chunk) { 176806c3fb27SDimitry Andric // no more loop iterations to process, 176906c3fb27SDimitry Andric // the prevoius chunk was the last chunk 177006c3fb27SDimitry Andric break; 177106c3fb27SDimitry Andric } 177206c3fb27SDimitry Andric 177306c3fb27SDimitry Andric // original_ivs_start is next to previous chunk original_ivs_end, 177406c3fb27SDimitry Andric // we need to start new chunk here, so chunks will be one after another 177506c3fb27SDimitry Andric // without any gap or overlap: 177606c3fb27SDimitry Andric new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest, 177706c3fb27SDimitry Andric original_ivs_start, n); 177806c3fb27SDimitry Andric } 177906c3fb27SDimitry Andric 178006c3fb27SDimitry Andric return FALSE; 178106c3fb27SDimitry Andric } 1782