xref: /llvm-project/openmp/runtime/test/worksharing/for/kmp_sch_simd_guided.c (revision c0e4a0c3a46e16ef066ec11d1d48298b06bef21c)
1 // RUN: %libomp-compile && env LIBOMP_USE_HIDDEN_HELPER_TASK=0 LIBOMP_NUM_HIDDEN_HELPER_THREADS=0 %libomp-run
2 /*
3   Test for the 'schedule(simd:guided)' clause.
4   Compiler needs to generate a dynamic dispatching and pass the schedule
5   value 46 to the OpenMP RTL. Test uses numerous loop parameter combinations.
6 */
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <omp.h>
10 
// delay(): short, platform-specific sleep used while polling a flag.
// The expansion deliberately has no trailing ';' so that `delay();` is
// exactly one statement and stays safe inside an unbraced if/else.
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1)
#else
#include <unistd.h>
#define delay() usleep(10)
#endif
18 
19 // uncomment for debug diagnostics:
20 //#define DEBUG
21 
22 #define SIMD_LEN 4
23 
24 // ---------------------------------------------------------------------------
25 // Various definitions copied from OpenMP RTL
// Schedule kinds handed to __kmpc_dispatch_init_*(). The numeric values are
// copied from the OpenMP RTL and have to stay in sync with it.
enum sched {
  kmp_sch_static_balanced_chunked = 45,
  kmp_sch_guided_simd = 46, // schedule(simd:guided); the only kind used by this test
  kmp_sch_runtime_simd = 47,
};
// Short aliases for the integer types used by the tests below.
typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;
// Descriptor passed as the first argument of every __kmpc_* entry point
// declared below.
// NOTE(review): presumably mirrors the RTL's source-location struct (ident_t);
// confirm the field layout against kmp.h.
typedef struct {
  int reserved_1;
  int flags;
  int reserved_2;
  int reserved_3;
  char *psource; // ";file;func;line;col;;"-style string (see `loc`)
} id;
41 
// Entry points of the OpenMP RTL that this test calls directly.
// NOTE(review): __kmpc_global_thread_num is declared but not called in this file.
extern int __kmpc_global_thread_num(id*);
extern void __kmpc_barrier(id*, int gtid);
// Dispatch init, 32- and 64-bit flavours. As called in this file the
// arguments are: (loc, gtid, schedule, lower bound, upper bound, stride, chunk).
extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
// Dispatch next. As called in this file the arguments are:
// (loc, gtid, &last, &lb, &ub, &st); a non-zero return means a chunk was
// handed out and the bounds/stride were written through the pointers.
extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
48 // End of definitions copied from OpenMP RTL.
49 // ---------------------------------------------------------------------------
// Location descriptor handed to every __kmpc_* call in this file.
// NOTE(review): flags == 2 presumably corresponds to KMP_IDENT_KMPC in the
// RTL -- confirm against kmp.h.
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
// This variable is defined in OpenMP RTL but we can't have it exposed so we
// need to redefine it here.
// main() overwrites it from LIBOMP_NUM_HIDDEN_HELPER_THREADS, and the
// run_loop_*() functions add it to the gtid of every thread whose tid != 0.
static int __kmp_hidden_helper_threads_num = 0;
54 
55 // ---------------------------------------------------------------------------
// Check guided-simd dispatching (kmp_sch_guided_simd) for one 64-bit loop.
//
// Meant to be called by every thread of a parallel region with identical
// arguments (the function ends with two barriers). Each thread initializes
// the dispatcher; thread 0 then drains all chunks through
// __kmpc_dispatch_next_8() and validates them, while the remaining threads
// wait and afterwards verify that no chunk is left for them.
//
//   loop_lb, loop_ub  inclusive bounds of the loop
//   loop_st           loop stride, positive or negative
//   loop_chunk        chunk size passed to the runtime
//
// Returns the number of errors detected by the calling thread. Degenerate
// loops (zero stride, or bounds that give no iterations) are not tested and
// yield 0.
int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) {
  int err = 0;
  // Raised by thread 0 once it has consumed every chunk; lowered again at the
  // end so that the next call starts from a clean state.
  static int volatile loop_sync = 0;
  i64 lb;   // Chunk lower bound
  i64 ub;   // Chunk upper bound
  i64 st;   // Chunk stride
  int last; // Written by __kmpc_dispatch_next_8(); not inspected by this test
  int tid = omp_get_thread_num();
  int gtid = tid;
  if (gtid) {
    // Global ids of non-zero threads are shifted by the hidden helper count
    gtid += __kmp_hidden_helper_threads_num;
  }
  // #ifdef (not #if): the documented switch is a bare `#define DEBUG`
#ifdef DEBUG
  printf("run_loop_<%d>(gtid=%d, tid=%d, lb=%d, ub=%d, st=%d, ch=%d)\n",
         (int)sizeof(i64), gtid, tid,
         (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
#endif
  // Don't test degenerate cases that should have been discovered by codegen
  if (loop_st == 0)
    return 0;
  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
    return 0;

  __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd,
                         loop_lb, loop_ub, loop_st, loop_chunk);
  if (tid == 0) {
    // Let the master thread handle the chunks alone
    int chunk = 0;            // Number of the current chunk
    i64 next_lb = loop_lb;    // Expected lower bound of the next chunk
    i64 last_ub = loop_lb;    // Upper bound of the last processed chunk
    u64 cur;                  // Number of iterations in the current chunk
    // Max allowed iterations for the current chunk;
    // the first chunk can consume all iterations
    u64 max = (loop_ub - loop_lb) / loop_st + 1;
    int undersized = 0;       // Previous chunk was smaller than loop_chunk

    while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
      ++chunk;
#ifdef DEBUG
      printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
#endif
      // Only the final chunk may be undersized; getting another chunk after
      // an undersized one is therefore an error
      if (undersized) {
        printf("Error with chunk %d\n", chunk);
        err++;
      }
      // Chunks must be contiguous
      if (lb != next_lb) {
        printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
        err++;
      }
      // Chunk must stay inside the loop and be ordered along the stride
      if (loop_st > 0) {
        if (!(ub <= loop_ub)) {
          printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
          err++;
        }
        if (!(lb <= ub)) {
          printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
          err++;
        }
      } else {
        if (!(ub >= loop_ub)) {
          printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
          err++;
        }
        if (!(lb >= ub)) {
          printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
          err++;
        }
      }
      // Stride should not change
      if (!(st == loop_st)) {
        printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
        err++;
      }
      cur = (ub - lb) / loop_st + 1;
      // Guided scheduling uses FP computations, so current chunk may
      // be a bit bigger (+1) than allowed maximum
      if (!(cur <= max + 1)) {
        printf("Error with iter %llu, %llu\n", cur, max);
        err++;
      }
      // Update maximum for the next chunk
      if (cur < max)
        max = cur;
      next_lb = ub + loop_st;
      last_ub = ub;
      undersized = (cur < (u64)loop_chunk);
    }

    if (chunk == 0) {
      // Must have at least one chunk. Without one there is no last upper
      // bound to validate, so the checks below are skipped.
      printf("Error with chunk %d\n", chunk);
      err++;
    } else if (loop_st > 0) {
      // Must have the right last iteration index
      if (!(last_ub <= loop_ub)) {
        printf("Error with last1 %d, %d, ch %d\n",
               (int)last_ub, (int)loop_ub, chunk);
        err++;
      }
      if (!(last_ub + loop_st > loop_ub)) {
        printf("Error with last2 %d, %d, %d, ch %d\n",
               (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
        err++;
      }
    } else {
      if (!(last_ub >= loop_ub)) {
        printf("Error with last1 %d, %d, ch %d\n",
               (int)last_ub, (int)loop_ub, chunk);
        err++;
      }
      if (!(last_ub + loop_st < loop_ub)) {
        printf("Error with last2 %d, %d, %d, ch %d\n",
               (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
        err++;
      }
    }
    // Let non-master threads go
    loop_sync = 1;
  } else {
    // Workers wait for master thread to finish, then call __kmpc_dispatch_next.
    // Spin briefly first, then fall back to sleeping between polls.
    for (int spin = 0; spin < 1000000 && loop_sync == 0; ++spin) {
    }
    while (loop_sync == 0) {
      delay();
    }
    // At this moment we do not have any more chunks -- all the chunks were
    // already processed by the master thread
    if (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
      printf("Error return value\n");
      err++;
    }
  }

  // First barrier: everybody is done with loop_sync; second barrier: nobody
  // starts the next call before the flag has been reset.
  __kmpc_barrier(&loc, gtid);
  if (tid == 0) {
    loop_sync = 0;    // Restore original state
#ifdef DEBUG
    printf("run_loop_64(): at the end\n");
#endif
  }
  __kmpc_barrier(&loc, gtid);
  return err;
} // run_loop_64
209 
210 // ---------------------------------------------------------------------------
run_loop_32(int loop_lb,int loop_ub,int loop_st,int loop_chunk)211 int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) {
212   int err = 0;
213   static int volatile loop_sync = 0;
214   int lb;   // Chunk lower bound
215   int ub;   // Chunk upper bound
216   int st;   // Chunk stride
217   int rc;
218   int tid = omp_get_thread_num();
219   int gtid = tid;
220   if (gtid) {
221     gtid += __kmp_hidden_helper_threads_num;
222   }
223   int last;
224 #if DEBUG
225   printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n",
226     (int)sizeof(int), gtid, tid,
227     (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
228 #endif
229   // Don't test degenerate cases that should have been discovered by codegen
230   if (loop_st == 0)
231     return 0;
232   if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
233     return 0;
234 
235   __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd,
236                          loop_lb, loop_ub, loop_st, loop_chunk);
237   if (tid == 0) {
238     // Let the master thread handle the chunks alone
239     int chunk;      // No of current chunk
240     int next_lb;    // Lower bound of the next chunk
241     int last_ub;    // Upper bound of the last processed chunk
242     u64 cur;        // Number of interations in  current chunk
243     u64 max;        // Max allowed iterations for current chunk
244     int undersized = 0;
245 
246     chunk = 0;
247     next_lb = loop_lb;
248     max = (loop_ub - loop_lb) / loop_st + 1;
249     // The first chunk can consume all iterations
250     while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
251       ++ chunk;
252 #if DEBUG
253       printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
254 #endif
255       // Check if previous chunk (it is not the final chunk) is undersized
256       if (undersized) {
257         printf("Error with chunk %d\n", chunk);
258         err++;
259       }
260       // Check lower and upper bounds
261       if (lb != next_lb) {
262         printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
263         err++;
264       }
265       if (loop_st > 0) {
266         if (!(ub <= loop_ub)) {
267           printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
268           err++;
269         }
270         if (!(lb <= ub)) {
271           printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
272           err++;
273         }
274       } else {
275         if (!(ub >= loop_ub)) {
276           printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
277           err++;
278         }
279         if (!(lb >= ub)) {
280           printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
281           err++;
282         }
283       }; // if
284       // Stride should not change
285       if (!(st == loop_st)) {
286         printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
287         err++;
288       }
289       cur = (ub - lb) / loop_st + 1;
290       // Guided scheduling uses FP computations, so current chunk may
291       // be a bit bigger (+1) than allowed maximum
292       if (!(cur <= max + 1)) {
293         printf("Error with iter %llu, %llu\n", cur, max);
294         err++;
295       }
296       // Update maximum for the next chunk
297       if (cur < max)
298         max = cur;
299       next_lb = ub + loop_st;
300       last_ub = ub;
301       undersized = (cur < loop_chunk);
302     }; // while
303     // Must have at least one chunk
304     if (!(chunk > 0)) {
305       printf("Error with chunk %d\n", chunk);
306       err++;
307     }
308     // Must have the right last iteration index
309     if (loop_st > 0) {
310       if (!(last_ub <= loop_ub)) {
311         printf("Error with last1 %d, %d, ch %d\n",
312                (int)last_ub, (int)loop_ub, chunk);
313         err++;
314       }
315       if (!(last_ub + loop_st > loop_ub)) {
316         printf("Error with last2 %d, %d, %d, ch %d\n",
317                (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
318         err++;
319       }
320     } else {
321       if (!(last_ub >= loop_ub)) {
322         printf("Error with last1 %d, %d, ch %d\n",
323                (int)last_ub, (int)loop_ub, chunk);
324         err++;
325       }
326       if (!(last_ub + loop_st < loop_ub)) {
327         printf("Error with last2 %d, %d, %d, ch %d\n",
328                (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
329         err++;
330       }
331     }; // if
332     // Let non-master threads go
333     loop_sync = 1;
334   } else {
335     int i;
336     // Workers wait for master thread to finish, then call __kmpc_dispatch_next
337     for (i = 0; i < 1000000; ++ i) {
338       if (loop_sync != 0) {
339         break;
340       }; // if
341     }; // for i
342     while (loop_sync == 0) {
343       delay();
344     }; // while
345     // At this moment we do not have any more chunks -- all the chunks already
346     // processed by the master thread
347     rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st);
348     if (rc) {
349       printf("Error return value\n");
350       err++;
351     }
352   }; // if
353 
354   __kmpc_barrier(&loc, gtid);
355   if (tid == 0) {
356       loop_sync = 0;    // Restore original state
357 #if DEBUG
358       printf("run_loop<>(): at the end\n");
359 #endif
360   }; // if
361   __kmpc_barrier(&loc, gtid);
362   return err;
363 } // run_loop
364 
365 // ---------------------------------------------------------------------------
run_64(int num_th)366 int run_64(int num_th)
367 {
368  int err = 0;
369 #pragma omp parallel num_threads(num_th)
370  {
371   int chunk;
372   i64 st, lb, ub;
373   for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
374     for (st = 1; st <= 3; ++ st) {
375       for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
376         for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
377           err += run_loop_64(lb, ub,  st, chunk);
378           err += run_loop_64(ub, lb, -st, chunk);
379         }; // for ub
380       }; // for lb
381     }; // for st
382   }; // for chunk
383  }
384  return err;
385 } // run_all
386 
run_32(int num_th)387 int run_32(int num_th)
388 {
389  int err = 0;
390 #pragma omp parallel num_threads(num_th)
391  {
392   int chunk, st, lb, ub;
393   for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
394     for (st = 1; st <= 3; ++ st) {
395       for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
396         for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
397           err += run_loop_32(lb, ub,  st, chunk);
398           err += run_loop_32(ub, lb, -st, chunk);
399         }; // for ub
400       }; // for lb
401     }; // for st
402   }; // for chunk
403  }
404  return err;
405 } // run_all
406 
407 // ---------------------------------------------------------------------------
main()408 int main()
409 {
410   {
411     const char *env = getenv("LIBOMP_NUM_HIDDEN_HELPER_THREADS");
412     if (env) {
413       __kmp_hidden_helper_threads_num = atoi(env);
414     }
415   }
416 
417   int n, err = 0;
418   for (n = 1; n <= 4; ++ n) {
419     err += run_32(n);
420     err += run_64(n);
421   }; // for n
422   if (err)
423     printf("failed with %d errors\n", err);
424   else
425     printf("passed\n");
426   return err;
427 }
428