/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

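// Three-way sign: returns -1, 0 or +1 for negative, zero and positive values.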
template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

//----------Loop canonicalization---------------------------------------------

// For loop nest (any shape):
// convert != to < or >;
// switch from using < or > to <= or >=.
// "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
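// Example (for illustration only): a loop written as
//   for (i = 0; i != 10; i += 2)
// arrives here with comp_not_eq; because the step is positive it is first
// turned into comp_less, and then into comp_less_or_eq with ub0 adjusted
// from 10 to 9.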
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed).
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}

// Canonicalize loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use the span for this, it's too big.
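// Example (for illustration only): a canonicalized loop with lb0 = 0, ub0 = 9,
// step = 2 and comp_less_or_eq gets trip_count = (9 - 0) / 2 + 1 = 5.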

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim original iv according to its type.
// Return a kmp_uint64 value which can be easily used in all internal
// calculations and can be statically cast back to the original type in user
// code.
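// Example (for illustration only): for loop_type_int8 and original_iv = 0xFF,
// the value is first truncated to the 8-bit signed value -1 and then
// sign-extended, so the function returns 0xFFFFFFFFFFFFFFFF.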
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into upper bounds on this level,
// false otherwise
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into lower-upper bounds on this level
// (if not, we need to re-calculate)
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we should
    // start with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// In the rectangular case the result always fits into the loop bounds, so no
// bounds check (and hence no return value) is needed.
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize loop nest and calculate overall trip count.
// "bounds_nest" has to be allocated per thread.
// API will modify original bounds_nest array to bring it to a canonical form
// (only <= and >=, no !=, <, >). If the original loop nest was already in a
// canonical form there will be no changes to bounds in bounds_nest array
// (only trip counts will be calculated).
// Returns trip count of overall space.
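// Illustration (hypothetical, simplified compiler-generated usage): for
//   #pragma omp for collapse(2)
//   for (i = 0; i < M; ++i)
//     for (j = 0; j < N; ++j) ...
// the compiler would fill a two-element bounds_info_t array describing both
// loops, and this call would return M * N as the collapsed trip count.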
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to overall new_iv.
// Note: original IVs will be returned as if they had kmp_uint64 type and will
// have to be converted to the original types in user code.
// Note: trip counts should be already calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do a different
// interface without an array or possibly template this over n
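// Example (for illustration only): with two loops whose trip counts are 3
// (outer) and 4 (inner), new_iv = 7 decomposes into iterations {1, 3}, so the
// returned original_ivs are {lb0_outer + 1 * step_outer,
// lb0_inner + 3 * step_inner}, each trimmed to its original type.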
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calc corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
  __kmp_free(iterations);
}

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate span for IV on this loop level for "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate span for IV on this loop level for ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with loop variable type on
    // 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to step, because
      // lower bound might be off after adjustments

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate maximum possible span for IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped shape.
// We need to stay in the original range of values, so that there will be no
// overflow; for that we'll adjust both upper and lower bounds as needed.
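// Simplified illustration: for the triangular nest
//   for (i = 0; i < N; ++i)
//     for (j = i; j < N; ++j) ...
// the inner loop has lb1 == 1 and ub1 == 0 after canonicalization; the signs
// of lb1 and ub1 differ, so the inner loop is widened to the rectangle
// j in [0, N-1], and the points added by the widening are filtered out later
// when original IVs are checked against the original bounds.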
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // Get the upper and lower bounds to be parallel,
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we are now getting the same
    // span irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good at the biggest outer_iv value,
        // so we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // still need to move lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good at the biggest outer_iv value,
        // so we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // still need to move upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process the loop nest to get a parallelepiped shape,
// calculate the biggest spans for IVs on all levels and calculate the overall
// trip count. "bounds_nest" has to be allocated per thread.
// Returns overall trip count (for adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative),
// not counting the initial iteration.
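// Example (for illustration only): with lb0 = 0, lb1 = 0, step = 2 and
// comp_less_or_eq, original_ivs[ind] = 6 corresponds to (6 - 0) / 2 = 3
// iterations after the initial one.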
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative).
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take the updated bounds and calculate what new_iv corresponds to this point.
// When we are getting original IVs from new_iv, we have to adjust them to fit
// into the original loop bounds. Getting new_iv for the adjusted original IVs
// will help with making more chunks non-empty.
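// Example (for illustration only): with two levels whose trip counts are 3
// and 4 and per-level iteration numbers 1 and 3, new_iv = 1 * 4 + 3 = 7
// (a mixed-radix encoding of the per-level iteration numbers).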
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for provided iterations, assuming iterations are
// calculated in the original space.
// Loop nest is in canonical form (with <= / >=).
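// If the iv computed on some level does not fit its (possibly outer-iv
// dependent) bounds, the routine below advances the next outer loop by one
// iteration, resets all inner iteration counters to zero and retries,
// similar to a carry in positional arithmetic.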
882*06c3fb27SDimitry Andric bool kmp_calc_original_ivs_from_iterations(
883*06c3fb27SDimitry Andric     const bounds_info_t *original_bounds_nest, kmp_index_t n,
884*06c3fb27SDimitry Andric     /*in/out*/ kmp_point_t original_ivs,
885*06c3fb27SDimitry Andric     /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {
886*06c3fb27SDimitry Andric 
887*06c3fb27SDimitry Andric   kmp_index_t lengthened_ind = n;
888*06c3fb27SDimitry Andric 
889*06c3fb27SDimitry Andric   for (; ind < n;) {
890*06c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
891*06c3fb27SDimitry Andric     bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
892*06c3fb27SDimitry Andric                                 ind, (lengthened_ind < ind), true);
893*06c3fb27SDimitry Andric 
894*06c3fb27SDimitry Andric     if (!good) {
895*06c3fb27SDimitry Andric       // The calculated iv value is too big (or too small for >=):
896*06c3fb27SDimitry Andric       if (ind == 0) {
897*06c3fb27SDimitry Andric         // Space is empty:
898*06c3fb27SDimitry Andric         return false;
899*06c3fb27SDimitry Andric       } else {
900*06c3fb27SDimitry Andric         // Go to next iteration on the outer loop:
901*06c3fb27SDimitry Andric         --ind;
902*06c3fb27SDimitry Andric         ++iterations[ind];
903*06c3fb27SDimitry Andric         lengthened_ind = ind;
904*06c3fb27SDimitry Andric         for (kmp_index_t i = ind + 1; i < n; ++i) {
905*06c3fb27SDimitry Andric           iterations[i] = 0;
906*06c3fb27SDimitry Andric         }
907*06c3fb27SDimitry Andric         continue;
908*06c3fb27SDimitry Andric       }
909*06c3fb27SDimitry Andric     }
910*06c3fb27SDimitry Andric     ++ind;
911*06c3fb27SDimitry Andric   }
912*06c3fb27SDimitry Andric 
913*06c3fb27SDimitry Andric   return true;
914*06c3fb27SDimitry Andric }
915*06c3fb27SDimitry Andric 
916*06c3fb27SDimitry Andric //----------Calculate original ivs for the beginning of the loop nest---------
917*06c3fb27SDimitry Andric 
918*06c3fb27SDimitry Andric // Calculate IVs for the beginning of the loop nest.
919*06c3fb27SDimitry Andric // Note: simply taking the lower bounds of all loops may not work if, on some
920*06c3fb27SDimitry Andric // iterations of the outer loops, the inner loops are empty.
921*06c3fb27SDimitry Andric // Loop nest is in canonical form (with <= / >=).
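// For example (hypothetical triangular nest):
//   for (i = 0; i < n; ++i)
//     for (j = 0; j < i; ++j)
// the point (0, 0) is not a valid start because the inner loop is empty for
// i == 0, so the search below has to advance the outer loop first.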
922*06c3fb27SDimitry Andric bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
923*06c3fb27SDimitry Andric                                      kmp_index_t n,
924*06c3fb27SDimitry Andric                                      /*out*/ kmp_point_t original_ivs) {
925*06c3fb27SDimitry Andric 
926*06c3fb27SDimitry Andric   // Iterations in the original space, multiplied by step:
927*06c3fb27SDimitry Andric   kmp_iterations_t iterations =
928*06c3fb27SDimitry Andric       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
929*06c3fb27SDimitry Andric 
930*06c3fb27SDimitry Andric   for (kmp_index_t ind = n; ind > 0;) {
931*06c3fb27SDimitry Andric     --ind;
932*06c3fb27SDimitry Andric     iterations[ind] = 0;
933*06c3fb27SDimitry Andric   }
934*06c3fb27SDimitry Andric 
935*06c3fb27SDimitry Andric   // Now calculate the point:
936*06c3fb27SDimitry Andric   bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
937*06c3fb27SDimitry Andric                                                  /*in/out*/ original_ivs,
938*06c3fb27SDimitry Andric                                                  /*in/out*/ iterations, 0);
939*06c3fb27SDimitry Andric   __kmp_free(iterations);
940*06c3fb27SDimitry Andric   return b;
941*06c3fb27SDimitry Andric }
942*06c3fb27SDimitry Andric 
943*06c3fb27SDimitry Andric //----------Calculate next point in the original loop space-------------------
944*06c3fb27SDimitry Andric 
945*06c3fb27SDimitry Andric // From current set of original IVs calculate next point.
946*06c3fb27SDimitry Andric // Return false if there is no next point in the loop bounds.
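// A rough sketch with hypothetical counts: if the current point corresponds to
// iterations {1, 2} and the inner loop has 3 iterations on this outer
// iteration, bumping the innermost counter to 3 overflows it;
// kmp_calc_original_ivs_from_iterations() then carries into the outer loop and
// yields the point for iterations {2, 0}, provided it is still within bounds.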
947*06c3fb27SDimitry Andric bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
948*06c3fb27SDimitry Andric                                 kmp_index_t n, const kmp_point_t original_ivs,
949*06c3fb27SDimitry Andric                                 /*out*/ kmp_point_t next_original_ivs) {
950*06c3fb27SDimitry Andric   // Iterations in the original space, multiplied by step (so can be negative):
951*06c3fb27SDimitry Andric   kmp_iterations_t iterations =
952*06c3fb27SDimitry Andric       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
953*06c3fb27SDimitry Andric 
954*06c3fb27SDimitry Andric   // First, calc corresponding iteration in every original loop:
955*06c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
956*06c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
957*06c3fb27SDimitry Andric     iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
958*06c3fb27SDimitry Andric   }
959*06c3fb27SDimitry Andric 
960*06c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
961*06c3fb27SDimitry Andric     next_original_ivs[ind] = original_ivs[ind];
962*06c3fb27SDimitry Andric   }
963*06c3fb27SDimitry Andric 
964*06c3fb27SDimitry Andric   // Next add one step to the iterations on the inner-most level, and see if we
965*06c3fb27SDimitry Andric   // need to move up the nest:
966*06c3fb27SDimitry Andric   kmp_index_t ind = n - 1;
967*06c3fb27SDimitry Andric   ++iterations[ind];
968*06c3fb27SDimitry Andric 
969*06c3fb27SDimitry Andric   bool b = kmp_calc_original_ivs_from_iterations(
970*06c3fb27SDimitry Andric       original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
971*06c3fb27SDimitry Andric 
972*06c3fb27SDimitry Andric   __kmp_free(iterations);
973*06c3fb27SDimitry Andric   return b;
974*06c3fb27SDimitry Andric }
975*06c3fb27SDimitry Andric 
976*06c3fb27SDimitry Andric //----------Calculate chunk end in the original loop space--------------------
977*06c3fb27SDimitry Andric 
978*06c3fb27SDimitry Andric // For one level calculate old induction variable corresponding to overall
979*06c3fb27SDimitry Andric // new_iv for the chunk end.
980*06c3fb27SDimitry Andric // Return true if it fits within the upper bound on this level
981*06c3fb27SDimitry Andric // (if not, we need to re-calculate).
982*06c3fb27SDimitry Andric template <typename T>
983*06c3fb27SDimitry Andric bool kmp_calc_one_iv_for_chunk_end_XX(
984*06c3fb27SDimitry Andric     const bounds_infoXX_template<T> *bounds,
985*06c3fb27SDimitry Andric     const bounds_infoXX_template<T> *updated_bounds,
986*06c3fb27SDimitry Andric     /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
987*06c3fb27SDimitry Andric     kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
988*06c3fb27SDimitry Andric     const kmp_point_t original_ivs_start) {
989*06c3fb27SDimitry Andric 
990*06c3fb27SDimitry Andric   // typedef  std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
991*06c3fb27SDimitry Andric   // big_span_t;
992*06c3fb27SDimitry Andric 
993*06c3fb27SDimitry Andric   // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
994*06c3fb27SDimitry Andric   T temp = 0;
995*06c3fb27SDimitry Andric 
996*06c3fb27SDimitry Andric   T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
997*06c3fb27SDimitry Andric 
998*06c3fb27SDimitry Andric   if (start_with_lower_bound) {
999*06c3fb27SDimitry Andric     // We moved to the next iteration on one of the outer loops; may as well
1000*06c3fb27SDimitry Andric     // use the lower bound here:
1001*06c3fb27SDimitry Andric     temp = bounds->lb0 + bounds->lb1 * outer_iv;
1002*06c3fb27SDimitry Andric   } else {
1003*06c3fb27SDimitry Andric     // Start in the expanded space, but:
1004*06c3fb27SDimitry Andric     // - we need to hit the original space lower bound, so we need to account
1005*06c3fb27SDimitry Andric     // for that
1006*06c3fb27SDimitry Andric     // - we have to go into the original space, even if that means adding more
1007*06c3fb27SDimitry Andric     // iterations than were planned
1008*06c3fb27SDimitry Andric     // - we have to go past (or be equal to) the previous point (which is the
1009*06c3fb27SDimitry Andric     // chunk starting point)
1010*06c3fb27SDimitry Andric 
1011*06c3fb27SDimitry Andric     auto iteration = iterations[ind];
1012*06c3fb27SDimitry Andric 
1013*06c3fb27SDimitry Andric     auto step = bounds->step;
1014*06c3fb27SDimitry Andric 
1015*06c3fb27SDimitry Andric     // In case of >= it's negative:
1016*06c3fb27SDimitry Andric     auto accountForStep =
1017*06c3fb27SDimitry Andric         ((bounds->lb0 + bounds->lb1 * outer_iv) -
1018*06c3fb27SDimitry Andric          (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
1019*06c3fb27SDimitry Andric         step;
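    // For example (hypothetical values, <= loop): if the original lower bound
    // is 3, the expanded lower bound is 0 and the step is 2, then
    // accountForStep == (3 - 0) % 2 == 1, so the candidates
    // 0 + 1 + iteration * 2 stay on the same residue (modulo the step) as the
    // original lower bound.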
1020*06c3fb27SDimitry Andric 
1021*06c3fb27SDimitry Andric     temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
1022*06c3fb27SDimitry Andric            accountForStep + iteration * step;
1023*06c3fb27SDimitry Andric 
1024*06c3fb27SDimitry Andric     if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1025*06c3fb27SDimitry Andric          (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
1026*06c3fb27SDimitry Andric         ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1027*06c3fb27SDimitry Andric          (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
1028*06c3fb27SDimitry Andric       // Too small (or too big), didn't reach the original lower bound. Use
1029*06c3fb27SDimitry Andric       // heuristic:
1030*06c3fb27SDimitry Andric       temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
1031*06c3fb27SDimitry Andric     }
1032*06c3fb27SDimitry Andric 
1033*06c3fb27SDimitry Andric     if (compare_with_start) {
1034*06c3fb27SDimitry Andric 
1035*06c3fb27SDimitry Andric       T start = static_cast<T>(original_ivs_start[ind]);
1036*06c3fb27SDimitry Andric 
1037*06c3fb27SDimitry Andric       temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1038*06c3fb27SDimitry Andric 
1039*06c3fb27SDimitry Andric       // On all previous levels the start of the chunk is the same as its end,
1040*06c3fb27SDimitry Andric       // so we need to be really careful here:
1041*06c3fb27SDimitry Andric       if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1042*06c3fb27SDimitry Andric            (temp < start)) ||
1043*06c3fb27SDimitry Andric           ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1044*06c3fb27SDimitry Andric            (temp > start))) {
1045*06c3fb27SDimitry Andric         // End of the chunk can't be smaller (for >=, bigger) than its start.
1046*06c3fb27SDimitry Andric         // Use heuristic:
1047*06c3fb27SDimitry Andric         temp = start + iteration / 4 * step;
1048*06c3fb27SDimitry Andric       }
1049*06c3fb27SDimitry Andric     }
1050*06c3fb27SDimitry Andric   }
1051*06c3fb27SDimitry Andric 
1052*06c3fb27SDimitry Andric   original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1053*06c3fb27SDimitry Andric 
1054*06c3fb27SDimitry Andric   if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1055*06c3fb27SDimitry Andric        (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
1056*06c3fb27SDimitry Andric       ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1057*06c3fb27SDimitry Andric        (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
1058*06c3fb27SDimitry Andric     // Too big (or too small for >=).
1059*06c3fb27SDimitry Andric     return false;
1060*06c3fb27SDimitry Andric   }
1061*06c3fb27SDimitry Andric 
1062*06c3fb27SDimitry Andric   return true;
1063*06c3fb27SDimitry Andric }
1064*06c3fb27SDimitry Andric 
1065*06c3fb27SDimitry Andric // For one level calculate old induction variable corresponding to overall
1066*06c3fb27SDimitry Andric // new_iv for the chunk end.
1067*06c3fb27SDimitry Andric bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
1068*06c3fb27SDimitry Andric                                    const bounds_info_t *updated_bounds,
1069*06c3fb27SDimitry Andric                                    /*in/out*/ kmp_point_t original_ivs,
1070*06c3fb27SDimitry Andric                                    const kmp_iterations_t iterations,
1071*06c3fb27SDimitry Andric                                    kmp_index_t ind, bool start_with_lower_bound,
1072*06c3fb27SDimitry Andric                                    bool compare_with_start,
1073*06c3fb27SDimitry Andric                                    const kmp_point_t original_ivs_start) {
1074*06c3fb27SDimitry Andric 
1075*06c3fb27SDimitry Andric   switch (bounds->loop_type) {
1076*06c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
1077*06c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
1078*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds),
1079*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
1080*06c3fb27SDimitry Andric         /*in/out*/
1081*06c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
1082*06c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
1083*06c3fb27SDimitry Andric     break;
1084*06c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
1085*06c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
1086*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds),
1087*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
1088*06c3fb27SDimitry Andric         /*in/out*/
1089*06c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
1090*06c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
1091*06c3fb27SDimitry Andric     break;
1092*06c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
1093*06c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
1094*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds),
1095*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
1096*06c3fb27SDimitry Andric         /*in/out*/
1097*06c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
1098*06c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
1099*06c3fb27SDimitry Andric     break;
1100*06c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
1101*06c3fb27SDimitry Andric     return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
1102*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds),
1103*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
1104*06c3fb27SDimitry Andric         /*in/out*/
1105*06c3fb27SDimitry Andric         original_ivs, iterations, ind, start_with_lower_bound,
1106*06c3fb27SDimitry Andric         compare_with_start, original_ivs_start);
1107*06c3fb27SDimitry Andric     break;
1108*06c3fb27SDimitry Andric   default:
1109*06c3fb27SDimitry Andric     KMP_ASSERT(false);
1110*06c3fb27SDimitry Andric     return false;
1111*06c3fb27SDimitry Andric   }
1112*06c3fb27SDimitry Andric }
1113*06c3fb27SDimitry Andric 
1114*06c3fb27SDimitry Andric // Calculate the old induction variables corresponding to the overall new_iv
1115*06c3fb27SDimitry Andric // for the chunk end. If, due to the space extension, we get old IVs outside of
1116*06c3fb27SDimitry Andric // the boundaries, bring them back into the boundaries. This has to be done in
1117*06c3fb27SDimitry Andric // the runtime, especially on the lower-bound side. When producing the result,
1118*06c3fb27SDimitry Andric // we need to make sure that the new chunk starts right after the old chunk
1119*06c3fb27SDimitry Andric // without overlapping it (this is done elsewhere), and that the end of the
1120*06c3fb27SDimitry Andric // chunk is not before its beginning. We don't need an exact ending point here,
1121*06c3fb27SDimitry Andric // just something more-or-less close to the desired chunk length; bigger is
1122*06c3fb27SDimitry Andric // fine (smaller would also work, but we risk going into an infinite loop, so
1123*06c3fb27SDimitry Andric // go smaller only at the very end of the space).
1124*06c3fb27SDimitry Andric // Result: false if the ending point could not be found in the original loop
1125*06c3fb27SDimitry Andric // space. In this case the caller can use the original upper bounds as the end
1126*06c3fb27SDimitry Andric // of the chunk. The chunk won't be empty, because it will have at least the
1127*06c3fb27SDimitry Andric // starting point, which is by construction in the original space.
1128*06c3fb27SDimitry Andric bool kmp_calc_original_ivs_for_chunk_end(
1129*06c3fb27SDimitry Andric     const bounds_info_t *original_bounds_nest, kmp_index_t n,
1130*06c3fb27SDimitry Andric     const bounds_info_internal_t *updated_bounds_nest,
1131*06c3fb27SDimitry Andric     const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
1132*06c3fb27SDimitry Andric     /*out*/ kmp_point_t original_ivs) {
1133*06c3fb27SDimitry Andric 
1134*06c3fb27SDimitry Andric   // Iterations in the expanded space:
1135*06c3fb27SDimitry Andric   kmp_iterations_t iterations =
1136*06c3fb27SDimitry Andric       (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
1137*06c3fb27SDimitry Andric 
1138*06c3fb27SDimitry Andric   // First, calc corresponding iteration in every modified loop:
1139*06c3fb27SDimitry Andric   for (kmp_index_t ind = n; ind > 0;) {
1140*06c3fb27SDimitry Andric     --ind;
1141*06c3fb27SDimitry Andric     auto &updated_bounds = updated_bounds_nest[ind];
1142*06c3fb27SDimitry Andric 
1143*06c3fb27SDimitry Andric     // should be optimized to OPDIVREM:
1144*06c3fb27SDimitry Andric     auto new_ind = new_iv / updated_bounds.b.trip_count;
1145*06c3fb27SDimitry Andric     auto iteration = new_iv % updated_bounds.b.trip_count;
1146*06c3fb27SDimitry Andric 
1147*06c3fb27SDimitry Andric     new_iv = new_ind;
1148*06c3fb27SDimitry Andric     iterations[ind] = iteration;
1149*06c3fb27SDimitry Andric   }
1150*06c3fb27SDimitry Andric   KMP_DEBUG_ASSERT(new_iv == 0);
1151*06c3fb27SDimitry Andric 
1152*06c3fb27SDimitry Andric   kmp_index_t lengthened_ind = n;
1153*06c3fb27SDimitry Andric   kmp_index_t equal_ind = -1;
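  // lengthened_ind is the outermost level whose iteration count has just been
  // bumped (all levels inside it restart from their lower bounds); equal_ind
  // tracks the innermost level up to which the candidate end point still
  // coincides with the chunk start, so that the end of the chunk is forced not
  // to precede its start on the first level where they differ.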
1154*06c3fb27SDimitry Andric 
1155*06c3fb27SDimitry Andric   // Next calculate the point, but in original loop nest.
1156*06c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n;) {
1157*06c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
1158*06c3fb27SDimitry Andric     auto updated_bounds = &(updated_bounds_nest[ind].b);
1159*06c3fb27SDimitry Andric 
1160*06c3fb27SDimitry Andric     bool good = kmp_calc_one_iv_for_chunk_end(
1161*06c3fb27SDimitry Andric         bounds, updated_bounds,
1162*06c3fb27SDimitry Andric         /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
1163*06c3fb27SDimitry Andric         (equal_ind >= ind - 1), original_ivs_start);
1164*06c3fb27SDimitry Andric 
1165*06c3fb27SDimitry Andric     if (!good) {
1166*06c3fb27SDimitry Andric       // Too big (or too small for >=).
1167*06c3fb27SDimitry Andric       if (ind == 0) {
1168*06c3fb27SDimitry Andric         // Need to reduce to the end.
1169*06c3fb27SDimitry Andric         __kmp_free(iterations);
1170*06c3fb27SDimitry Andric         return false;
1171*06c3fb27SDimitry Andric       } else {
1172*06c3fb27SDimitry Andric         // Go to next iteration on outer loop:
1173*06c3fb27SDimitry Andric         --ind;
1174*06c3fb27SDimitry Andric         ++(iterations[ind]);
1175*06c3fb27SDimitry Andric         lengthened_ind = ind;
1176*06c3fb27SDimitry Andric         if (equal_ind >= lengthened_ind) {
1177*06c3fb27SDimitry Andric           // We've changed the number of iterations here,
1178*06c3fb27SDimitry Andric           // can't be same anymore:
1179*06c3fb27SDimitry Andric           equal_ind = lengthened_ind - 1;
1180*06c3fb27SDimitry Andric         }
1181*06c3fb27SDimitry Andric         for (kmp_index_t i = ind + 1; i < n; ++i) {
1182*06c3fb27SDimitry Andric           iterations[i] = 0;
1183*06c3fb27SDimitry Andric         }
1184*06c3fb27SDimitry Andric         continue;
1185*06c3fb27SDimitry Andric       }
1186*06c3fb27SDimitry Andric     }
1187*06c3fb27SDimitry Andric 
1188*06c3fb27SDimitry Andric     if ((equal_ind == ind - 1) &&
1189*06c3fb27SDimitry Andric         (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1190*06c3fb27SDimitry Andric                     original_ivs_start[ind]))) {
1191*06c3fb27SDimitry Andric       equal_ind = ind;
1192*06c3fb27SDimitry Andric     } else if ((equal_ind > ind - 1) &&
1193*06c3fb27SDimitry Andric                !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1194*06c3fb27SDimitry Andric                             original_ivs_start[ind]))) {
1195*06c3fb27SDimitry Andric       equal_ind = ind - 1;
1196*06c3fb27SDimitry Andric     }
1197*06c3fb27SDimitry Andric     ++ind;
1198*06c3fb27SDimitry Andric   }
1199*06c3fb27SDimitry Andric 
1200*06c3fb27SDimitry Andric   __kmp_free(iterations);
1201*06c3fb27SDimitry Andric   return true;
1202*06c3fb27SDimitry Andric }
1203*06c3fb27SDimitry Andric 
1204*06c3fb27SDimitry Andric //----------Calculate upper bounds for the last chunk-------------------------
1205*06c3fb27SDimitry Andric 
1206*06c3fb27SDimitry Andric // Calculate one upper bound for the end.
1207*06c3fb27SDimitry Andric template <typename T>
1208*06c3fb27SDimitry Andric void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
1209*06c3fb27SDimitry Andric                             /*in/out*/ kmp_point_t original_ivs,
1210*06c3fb27SDimitry Andric                             kmp_index_t ind) {
1211*06c3fb27SDimitry Andric 
1212*06c3fb27SDimitry Andric   T temp = bounds->ub0 +
1213*06c3fb27SDimitry Andric            bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
1214*06c3fb27SDimitry Andric 
1215*06c3fb27SDimitry Andric   original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
1216*06c3fb27SDimitry Andric }
1217*06c3fb27SDimitry Andric 
1218*06c3fb27SDimitry Andric void kmp_calc_one_iv_end(const bounds_info_t *bounds,
1219*06c3fb27SDimitry Andric                          /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
1220*06c3fb27SDimitry Andric 
1221*06c3fb27SDimitry Andric   switch (bounds->loop_type) {
1222*06c3fb27SDimitry Andric   default:
1223*06c3fb27SDimitry Andric     KMP_ASSERT(false);
1224*06c3fb27SDimitry Andric     break;
1225*06c3fb27SDimitry Andric   case loop_type_t::loop_type_int32:
1226*06c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_int32>(
1227*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int32> *)(bounds),
1228*06c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
1229*06c3fb27SDimitry Andric     break;
1230*06c3fb27SDimitry Andric   case loop_type_t::loop_type_uint32:
1231*06c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_uint32>(
1232*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint32> *)(bounds),
1233*06c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
1234*06c3fb27SDimitry Andric     break;
1235*06c3fb27SDimitry Andric   case loop_type_t::loop_type_int64:
1236*06c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_int64>(
1237*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_int64> *)(bounds),
1238*06c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
1239*06c3fb27SDimitry Andric     break;
1240*06c3fb27SDimitry Andric   case loop_type_t::loop_type_uint64:
1241*06c3fb27SDimitry Andric     kmp_calc_one_iv_end_XX<kmp_uint64>(
1242*06c3fb27SDimitry Andric         (bounds_infoXX_template<kmp_uint64> *)(bounds),
1243*06c3fb27SDimitry Andric         /*in/out*/ original_ivs, ind);
1244*06c3fb27SDimitry Andric     break;
1245*06c3fb27SDimitry Andric   }
1246*06c3fb27SDimitry Andric }
1247*06c3fb27SDimitry Andric 
1248*06c3fb27SDimitry Andric // Calculate upper bounds for the last loop iteration. Just use original upper
1249*06c3fb27SDimitry Andric // bounds (adjusted when canonicalized to use <= / >=). No need to check that
1250*06c3fb27SDimitry Andric // this point is in the original space (it's likely not).
1251*06c3fb27SDimitry Andric void kmp_calc_original_ivs_for_end(
1252*06c3fb27SDimitry Andric     const bounds_info_t *const original_bounds_nest, kmp_index_t n,
1253*06c3fb27SDimitry Andric     /*out*/ kmp_point_t original_ivs) {
1254*06c3fb27SDimitry Andric   for (kmp_index_t ind = 0; ind < n; ++ind) {
1255*06c3fb27SDimitry Andric     auto bounds = &(original_bounds_nest[ind]);
1256*06c3fb27SDimitry Andric     kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
1257*06c3fb27SDimitry Andric   }
1258*06c3fb27SDimitry Andric }
1259*06c3fb27SDimitry Andric 
1260*06c3fb27SDimitry Andric //----------Init API for non-rectangular loops--------------------------------
1261*06c3fb27SDimitry Andric 
1262*06c3fb27SDimitry Andric // Init API for collapsed loops (static, no chunks defined).
1263*06c3fb27SDimitry Andric // "bounds_nest" has to be allocated per thread.
1264*06c3fb27SDimitry Andric // The API will modify the original bounds_nest array to bring it to a
1265*06c3fb27SDimitry Andric // canonical form (only <= and >=, no !=, <, >). If the original loop nest was
1266*06c3fb27SDimitry Andric // already in canonical form, the bounds in the bounds_nest array are not
1267*06c3fb27SDimitry Andric // changed (only the trip counts are calculated). Internally the API expands
1268*06c3fb27SDimitry Andric // the space to a parallelogram/parallelepiped, calculates the total, computes
1269*06c3fb27SDimitry Andric // the chunk bounds in terms of the new IV, re-calculates them in terms of the
1270*06c3fb27SDimitry Andric // old IVs (especially important on the left side, to hit the lower bounds and
1271*06c3fb27SDimitry Andric // not step over), and picks the correct chunk for this thread (so it computes
1272*06c3fb27SDimitry Andric // all chunks up to the needed one). It could be optimized to calculate just
1273*06c3fb27SDimitry Andric // this chunk, at the cost of a potentially less even distribution among
1274*06c3fb27SDimitry Andric // threads. It is designed to make sure that threads receive predictable
1275*06c3fb27SDimitry Andric // chunks deterministically (so the next nest of loops with similar
1276*06c3fb27SDimitry Andric // characteristics gets exactly the same chunks on the same threads).
1277*06c3fb27SDimitry Andric // Current contract: chunk_bounds_nest has only lb0 and ub0,
1278*06c3fb27SDimitry Andric // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
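//
// A rough usage sketch (not from the original sources; the exact
// initialization of bounds_info_t depends on kmp_collapse.h and is omitted):
//
//   bounds_info_t nest[2];  // one entry per collapsed loop level
//   // ... fill in the loop type, IV type, comparison, bounds, step and
//   //     outer_iv dependence for both levels ...
//   bounds_info_t chunk[2];
//   kmp_int32 last = 0;
//   if (__kmpc_for_collapsed_init(loc, gtid, nest, chunk, 2, &last)) {
//     // Run the loop body for IVs from chunk[i].lb0_u64 to chunk[i].ub0_u64
//     // on every level (lb1/ub1 are 0 under the current contract).
//   }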
1279*06c3fb27SDimitry Andric extern "C" kmp_int32
1280*06c3fb27SDimitry Andric __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
1281*06c3fb27SDimitry Andric                           /*in/out*/ bounds_info_t *original_bounds_nest,
1282*06c3fb27SDimitry Andric                           /*out*/ bounds_info_t *chunk_bounds_nest,
1283*06c3fb27SDimitry Andric                           kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
1284*06c3fb27SDimitry Andric 
1285*06c3fb27SDimitry Andric   KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
1286*06c3fb27SDimitry Andric   KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
1287*06c3fb27SDimitry Andric 
1288*06c3fb27SDimitry Andric   if (__kmp_env_consistency_check) {
1289*06c3fb27SDimitry Andric     __kmp_push_workshare(gtid, ct_pdo, loc);
1290*06c3fb27SDimitry Andric   }
1291*06c3fb27SDimitry Andric 
1292*06c3fb27SDimitry Andric   kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
1293*06c3fb27SDimitry Andric 
1294*06c3fb27SDimitry Andric   bounds_info_internal_t *updated_bounds_nest =
1295*06c3fb27SDimitry Andric       (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
1296*06c3fb27SDimitry Andric                                                n);
1297*06c3fb27SDimitry Andric 
1298*06c3fb27SDimitry Andric   for (kmp_index_t i = 0; i < n; ++i) {
1299*06c3fb27SDimitry Andric     updated_bounds_nest[i].b = original_bounds_nest[i];
1300*06c3fb27SDimitry Andric   }
1301*06c3fb27SDimitry Andric 
1302*06c3fb27SDimitry Andric   kmp_loop_nest_iv_t total =
1303*06c3fb27SDimitry Andric       kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
1304*06c3fb27SDimitry Andric 
1305*06c3fb27SDimitry Andric   if (plastiter != NULL) {
1306*06c3fb27SDimitry Andric     *plastiter = FALSE;
1307*06c3fb27SDimitry Andric   }
1308*06c3fb27SDimitry Andric 
1309*06c3fb27SDimitry Andric   if (total == 0) {
1310*06c3fb27SDimitry Andric     // Loop won't execute:
1311*06c3fb27SDimitry Andric     __kmp_free(updated_bounds_nest);
1312*06c3fb27SDimitry Andric     return FALSE;
1313*06c3fb27SDimitry Andric   }
1314*06c3fb27SDimitry Andric 
1315*06c3fb27SDimitry Andric   // OMPTODO: DISTRIBUTE is not supported yet
1316*06c3fb27SDimitry Andric   __kmp_assert_valid_gtid(gtid);
1317*06c3fb27SDimitry Andric   kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
1318*06c3fb27SDimitry Andric 
1319*06c3fb27SDimitry Andric   kmp_info_t *th = __kmp_threads[gtid];
1320*06c3fb27SDimitry Andric   kmp_team_t *team = th->th.th_team;
1321*06c3fb27SDimitry Andric   kmp_uint32 nth = team->t.t_nproc; // Number of threads
1322*06c3fb27SDimitry Andric 
1323*06c3fb27SDimitry Andric   KMP_DEBUG_ASSERT(tid < nth);
1324*06c3fb27SDimitry Andric 
1325*06c3fb27SDimitry Andric   kmp_point_t original_ivs_start =
1326*06c3fb27SDimitry Andric       (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1327*06c3fb27SDimitry Andric   kmp_point_t original_ivs_end =
1328*06c3fb27SDimitry Andric       (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1329*06c3fb27SDimitry Andric   kmp_point_t original_ivs_next_start =
1330*06c3fb27SDimitry Andric       (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1331*06c3fb27SDimitry Andric 
1332*06c3fb27SDimitry Andric   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
1333*06c3fb27SDimitry Andric                                        /*out*/ original_ivs_start)) {
1334*06c3fb27SDimitry Andric     // Loop won't execute:
1335*06c3fb27SDimitry Andric     __kmp_free(updated_bounds_nest);
1336*06c3fb27SDimitry Andric     __kmp_free(original_ivs_start);
1337*06c3fb27SDimitry Andric     __kmp_free(original_ivs_end);
1338*06c3fb27SDimitry Andric     __kmp_free(original_ivs_next_start);
1339*06c3fb27SDimitry Andric     return FALSE;
1340*06c3fb27SDimitry Andric   }
1341*06c3fb27SDimitry Andric 
1342*06c3fb27SDimitry Andric   // Not doing this optimization for one thread:
1343*06c3fb27SDimitry Andric   // (1) more to test
1344*06c3fb27SDimitry Andric   // (2) without it we keep the current contract that chunk_bounds_nest has
1345*06c3fb27SDimitry Andric   // only lb0 and ub0 (lb1 and ub1 are set to 0 and can be ignored).
1346*06c3fb27SDimitry Andric   // if (nth == 1) {
1347*06c3fb27SDimitry Andric   //  // One thread:
1348*06c3fb27SDimitry Andric   //  // Copy all info from original_bounds_nest, it'll be good enough.
1349*06c3fb27SDimitry Andric 
1350*06c3fb27SDimitry Andric   //  for (kmp_index_t i = 0; i < n; ++i) {
1351*06c3fb27SDimitry Andric   //    chunk_bounds_nest[i] = original_bounds_nest[i];
1352*06c3fb27SDimitry Andric   //  }
1353*06c3fb27SDimitry Andric 
1354*06c3fb27SDimitry Andric   //  if (plastiter != NULL) {
1355*06c3fb27SDimitry Andric   //    *plastiter = TRUE;
1356*06c3fb27SDimitry Andric   //  }
1357*06c3fb27SDimitry Andric   //  __kmp_free(updated_bounds_nest);
1358*06c3fb27SDimitry Andric   //  __kmp_free(original_ivs_start);
1359*06c3fb27SDimitry Andric   //  __kmp_free(original_ivs_end);
1360*06c3fb27SDimitry Andric   //  __kmp_free(original_ivs_next_start);
1361*06c3fb27SDimitry Andric   //  return TRUE;
1362*06c3fb27SDimitry Andric   //}
1363*06c3fb27SDimitry Andric 
1364*06c3fb27SDimitry Andric   kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
1365*06c3fb27SDimitry Andric       updated_bounds_nest, original_ivs_start, n);
1366*06c3fb27SDimitry Andric 
1367*06c3fb27SDimitry Andric   bool last_iter = false;
1368*06c3fb27SDimitry Andric 
1369*06c3fb27SDimitry Andric   for (; nth > 0;) {
1370*06c3fb27SDimitry Andric     // We could calculate the chunk size once, but this is to compensate for
1371*06c3fb27SDimitry Andric     // the fact that the original space is not a parallelepiped and some
1372*06c3fb27SDimitry Andric     // threads can be left without work:
1373*06c3fb27SDimitry Andric     KMP_DEBUG_ASSERT(total >= new_iv);
1374*06c3fb27SDimitry Andric 
1375*06c3fb27SDimitry Andric     kmp_loop_nest_iv_t total_left = total - new_iv;
1376*06c3fb27SDimitry Andric     kmp_loop_nest_iv_t chunk_size = total_left / nth;
1377*06c3fb27SDimitry Andric     kmp_loop_nest_iv_t remainder = total_left % nth;
1378*06c3fb27SDimitry Andric 
1379*06c3fb27SDimitry Andric     kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
1380*06c3fb27SDimitry Andric 
1381*06c3fb27SDimitry Andric     if (remainder > 0) {
1382*06c3fb27SDimitry Andric       ++curr_chunk_size;
1383*06c3fb27SDimitry Andric       --remainder;
1384*06c3fb27SDimitry Andric     }
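    // For example (hypothetical numbers): with total_left == 10 and nth == 4,
    // chunk_size == 2 and remainder == 2, so the chunk examined on this pass
    // gets 3 iterations; whatever is left is re-split among the remaining
    // threads on the following passes.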
1385*06c3fb27SDimitry Andric 
1386*06c3fb27SDimitry Andric #if defined(KMP_DEBUG)
1387*06c3fb27SDimitry Andric     kmp_loop_nest_iv_t new_iv_for_start = new_iv;
1388*06c3fb27SDimitry Andric #endif
1389*06c3fb27SDimitry Andric 
1390*06c3fb27SDimitry Andric     if (curr_chunk_size > 1) {
1391*06c3fb27SDimitry Andric       new_iv += curr_chunk_size - 1;
1392*06c3fb27SDimitry Andric     }
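    // new_iv now indexes the last iteration of the current chunk in the
    // expanded space.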
1393*06c3fb27SDimitry Andric 
1394*06c3fb27SDimitry Andric     if ((nth == 1) || (new_iv >= total - 1)) {
1395*06c3fb27SDimitry Andric       // Do this one till the end - just in case we miscalculated
1396*06c3fb27SDimitry Andric       // and either too much is left to process or new_iv is a bit too big:
1397*06c3fb27SDimitry Andric       kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1398*06c3fb27SDimitry Andric                                     /*out*/ original_ivs_end);
1399*06c3fb27SDimitry Andric 
1400*06c3fb27SDimitry Andric       last_iter = true;
1401*06c3fb27SDimitry Andric     } else {
1402*06c3fb27SDimitry Andric       // Note: here we make sure it's past (or equal to) the previous point.
1403*06c3fb27SDimitry Andric       if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
1404*06c3fb27SDimitry Andric                                                updated_bounds_nest,
1405*06c3fb27SDimitry Andric                                                original_ivs_start, new_iv,
1406*06c3fb27SDimitry Andric                                                /*out*/ original_ivs_end)) {
1407*06c3fb27SDimitry Andric         // We could not find the ending point, use the original upper bounds:
1408*06c3fb27SDimitry Andric         kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1409*06c3fb27SDimitry Andric                                       /*out*/ original_ivs_end);
1410*06c3fb27SDimitry Andric 
1411*06c3fb27SDimitry Andric         last_iter = true;
1412*06c3fb27SDimitry Andric       }
1413*06c3fb27SDimitry Andric     }
1414*06c3fb27SDimitry Andric 
1415*06c3fb27SDimitry Andric #if defined(KMP_DEBUG)
1416*06c3fb27SDimitry Andric     auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
1417*06c3fb27SDimitry Andric         updated_bounds_nest, original_ivs_end, n);
1418*06c3fb27SDimitry Andric     KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
1419*06c3fb27SDimitry Andric #endif
1420*06c3fb27SDimitry Andric 
1421*06c3fb27SDimitry Andric     if (last_iter && (tid != 0)) {
1422*06c3fb27SDimitry Andric       // We are done: this was the last chunk, but no chunk for the current
1423*06c3fb27SDimitry Andric       // thread was found:
1424*06c3fb27SDimitry Andric       __kmp_free(updated_bounds_nest);
1425*06c3fb27SDimitry Andric       __kmp_free(original_ivs_start);
1426*06c3fb27SDimitry Andric       __kmp_free(original_ivs_end);
1427*06c3fb27SDimitry Andric       __kmp_free(original_ivs_next_start);
1428*06c3fb27SDimitry Andric       return FALSE;
1429*06c3fb27SDimitry Andric     }
1430*06c3fb27SDimitry Andric 
1431*06c3fb27SDimitry Andric     if (tid == 0) {
1432*06c3fb27SDimitry Andric       // We found the chunk for this thread, now we need to check if it's the
1433*06c3fb27SDimitry Andric       // last chunk or not:
1434*06c3fb27SDimitry Andric 
1435*06c3fb27SDimitry Andric       if (last_iter ||
1436*06c3fb27SDimitry Andric           !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
1437*06c3fb27SDimitry Andric                                       /*out*/ original_ivs_next_start)) {
1438*06c3fb27SDimitry Andric         // no more loop iterations left to process,
1439*06c3fb27SDimitry Andric         // this means that the currently found chunk is the last chunk:
1440*06c3fb27SDimitry Andric         if (plastiter != NULL) {
1441*06c3fb27SDimitry Andric           *plastiter = TRUE;
1442*06c3fb27SDimitry Andric         }
1443*06c3fb27SDimitry Andric       }
1444*06c3fb27SDimitry Andric 
1445*06c3fb27SDimitry Andric       // Fill in chunk bounds:
1446*06c3fb27SDimitry Andric       for (kmp_index_t i = 0; i < n; ++i) {
1447*06c3fb27SDimitry Andric         chunk_bounds_nest[i] =
1448*06c3fb27SDimitry Andric             original_bounds_nest[i]; // To fill in types, etc. - optional
1449*06c3fb27SDimitry Andric         chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
1450*06c3fb27SDimitry Andric         chunk_bounds_nest[i].lb1_u64 = 0;
1451*06c3fb27SDimitry Andric 
1452*06c3fb27SDimitry Andric         chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
1453*06c3fb27SDimitry Andric         chunk_bounds_nest[i].ub1_u64 = 0;
1454*06c3fb27SDimitry Andric       }
1455*06c3fb27SDimitry Andric 
1456*06c3fb27SDimitry Andric       __kmp_free(updated_bounds_nest);
1457*06c3fb27SDimitry Andric       __kmp_free(original_ivs_start);
1458*06c3fb27SDimitry Andric       __kmp_free(original_ivs_end);
1459*06c3fb27SDimitry Andric       __kmp_free(original_ivs_next_start);
1460*06c3fb27SDimitry Andric       return TRUE;
1461*06c3fb27SDimitry Andric     }
1462*06c3fb27SDimitry Andric 
1463*06c3fb27SDimitry Andric     --tid;
1464*06c3fb27SDimitry Andric     --nth;
1465*06c3fb27SDimitry Andric 
1466*06c3fb27SDimitry Andric     bool next_chunk = kmp_calc_next_original_ivs(
1467*06c3fb27SDimitry Andric         original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
1468*06c3fb27SDimitry Andric     if (!next_chunk) {
1469*06c3fb27SDimitry Andric       // no more loop iterations to process,
1470*06c3fb27SDimitry Andric       // the previous chunk was the last chunk
1471*06c3fb27SDimitry Andric       break;
1472*06c3fb27SDimitry Andric     }
1473*06c3fb27SDimitry Andric 
1474*06c3fb27SDimitry Andric     // original_ivs_start is the point right after the previous chunk's
1475*06c3fb27SDimitry Andric     // original_ivs_end; we start the new chunk here so that chunks follow one
1476*06c3fb27SDimitry Andric     // another without any gap or overlap:
1477*06c3fb27SDimitry Andric     new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
1478*06c3fb27SDimitry Andric                                                original_ivs_start, n);
1479*06c3fb27SDimitry Andric   }
1480*06c3fb27SDimitry Andric 
1481*06c3fb27SDimitry Andric   __kmp_free(updated_bounds_nest);
1482*06c3fb27SDimitry Andric   __kmp_free(original_ivs_start);
1483*06c3fb27SDimitry Andric   __kmp_free(original_ivs_end);
1484*06c3fb27SDimitry Andric   __kmp_free(original_ivs_next_start);
1485*06c3fb27SDimitry Andric   return FALSE;
1486*06c3fb27SDimitry Andric }
1487