/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this thread
   received the very last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
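
      /* For example, n = 10 iterations split over nthreads = 4 threads
         gives q = 2, t = 2, so threads 0-3 receive the zero-based ranges
         [0,3), [3,6), [6,8) and [8,10) respectively.  */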

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C-sized chunk at chunk number
         static_trip * nthreads + i of the zero-based iteration space;
         it is clamped to the end below.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Detect when this thread's chunk lies entirely past the end of
         the iteration space, and clamp a final partial chunk.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
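
/* A minimal usage sketch (not built): how a caller might drive the STATIC
   iterator above for an ascending loop with step STEP.  The real drivers
   are the loop_ull.c entry points; the function and parameter names here
   are purely illustrative.  */
#if 0
static void
example_static_driver (gomp_ull step)
{
  gomp_ull start, end, x;

  /* Zero means this call produced another range for this thread.  */
  while (gomp_iter_ull_static_next (&start, &end) == 0)
    for (x = start; x < end; x += step)
      /* ... loop body using x ...  */;
}
#endif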


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
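
/* A minimal sketch (not built) of how the locked variant is driven: the
   caller holds the work-share lock around the call, as the loop_ull.c
   entry points do.  The wrapper name below is purely illustrative.  */
#if 0
static bool
example_dynamic_next_locked (gomp_ull *s, gomp_ull *e)
{
  struct gomp_thread *thr = gomp_thread ();
  bool ret;

  gomp_mutex_lock (&thr->ts.work_share->lock);
  ret = gomp_iter_ull_dynamic_next_locked (s, e);
  gomp_mutex_unlock (&thr->ts.work_share->lock);
  return ret;
}
#endif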


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

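  /* As the branches below use them, bit 0 of ws->mode selects the simple
     fetch-and-add fast path and bit 1 marks a loop whose stored bounds run
     downward; this reading is inferred from the branches here rather than
     from a documented contract.  */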
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
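
/* The lock-free paths above follow a claim-a-chunk idiom: read the shared
   cursor, compute a proposed chunk end, and publish it with a single
   compare-and-swap, retrying from the value observed on failure.  A
   self-contained sketch of that idiom (not built; all names below are
   illustrative and assume an ascending iteration space):  */
#if 0
static unsigned long long example_next;  /* shared cursor */
static unsigned long long example_end;   /* one past the last iteration */

static int
example_claim_chunk (unsigned long long chunk,
                     unsigned long long *s, unsigned long long *e)
{
  unsigned long long start = __atomic_load_n (&example_next, __ATOMIC_RELAXED);

  while (1)
    {
      unsigned long long left = example_end - start, nend, old;

      if (left == 0)
        return 0;  /* Nothing left to claim.  */
      nend = start + (chunk < left ? chunk : left);

      old = __sync_val_compare_and_swap (&example_next, start, nend);
      if (old == start)
        {
          *s = start;
          *e = nend;
          return 1;  /* Claimed [start, nend).  */
        }
      start = old;   /* Lost the race; retry from the value we saw.  */
    }
}
#endif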


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
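
/* The effect of the computation above: each successful call hands out about
   1/nthreads of the iterations that remain (never less than chunk_size), so
   chunk sizes shrink geometrically.  For example, with 1000 iterations left,
   nthreads = 4 and chunk_size = 1, successive chunks are 250, 188, 141, ...
   down to the chunk_size floor.  */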

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */