/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */
28*38fd1498Szrj
29*38fd1498Szrj #include "libgomp.h"
30*38fd1498Szrj #include <stdlib.h>
31*38fd1498Szrj
32*38fd1498Szrj
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  /* static_trip == -1 records that this thread already received the
     chunk containing the absolutely last iteration.  */
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      /* The single thread takes the entire remaining range in one trip.
	 An already-empty range (next == end) yields a nonzero return.  */
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      /* Each thread gets exactly one block in this mode, so a second
	 call simply reports completion.  */
      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  The bias S (incr minus
	 one step towards zero) makes the division count a partial final
	 step as a full iteration for either sign of the increment.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  The first
	 N % NTHREADS threads take one extra iteration each to absorb
	 the remainder.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      /* -1 marks that this block contained the very last iteration.  */
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      /* Total iteration count, computed as in the unspecified-chunk
	 case above.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  static_trip counts how many
	 chunks this thread has already taken.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      /* A chunk reaching iteration N is the absolutely last one.  */
      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
141*38fd1498Szrj
142*38fd1498Szrj
143*38fd1498Szrj /* This function implements the DYNAMIC scheduling method. Arguments are
144*38fd1498Szrj as for gomp_iter_static_next. This function must be called with ws->lock
145*38fd1498Szrj held. */
146*38fd1498Szrj
147*38fd1498Szrj bool
gomp_iter_dynamic_next_locked(long * pstart,long * pend)148*38fd1498Szrj gomp_iter_dynamic_next_locked (long *pstart, long *pend)
149*38fd1498Szrj {
150*38fd1498Szrj struct gomp_thread *thr = gomp_thread ();
151*38fd1498Szrj struct gomp_work_share *ws = thr->ts.work_share;
152*38fd1498Szrj long start, end, chunk, left;
153*38fd1498Szrj
154*38fd1498Szrj start = ws->next;
155*38fd1498Szrj if (start == ws->end)
156*38fd1498Szrj return false;
157*38fd1498Szrj
158*38fd1498Szrj chunk = ws->chunk_size;
159*38fd1498Szrj left = ws->end - start;
160*38fd1498Szrj if (ws->incr < 0)
161*38fd1498Szrj {
162*38fd1498Szrj if (chunk < left)
163*38fd1498Szrj chunk = left;
164*38fd1498Szrj }
165*38fd1498Szrj else
166*38fd1498Szrj {
167*38fd1498Szrj if (chunk > left)
168*38fd1498Szrj chunk = left;
169*38fd1498Szrj }
170*38fd1498Szrj end = start + chunk;
171*38fd1498Szrj
172*38fd1498Szrj ws->next = end;
173*38fd1498Szrj *pstart = start;
174*38fd1498Szrj *pend = end;
175*38fd1498Szrj return true;
176*38fd1498Szrj }
177*38fd1498Szrj
178*38fd1498Szrj
179*38fd1498Szrj #ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

  /* Nonzero ws->mode selects an unconditional fetch-and-add fast path.
     NOTE(review): this presumably relies on the loop bounds having been
     validated at work-share init so the add cannot wrap harmfully —
     confirm against the work-share setup code.  */
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
	{
	  /* Old value already at or past the end: nothing left.  */
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  /* Negative increment: the bound checks are mirrored.  */
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  /* Slow path: clamp the chunk to the remaining range first, then
     claim it with a compare-and-swap retry loop.  */
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
	return false;

      /* Trim the chunk so it does not run past END; the comparison
	 direction depends on the sign of the increment.  */
      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      /* On CAS failure TMP holds the value some other thread installed
	 in ws->next; retry with that as the new candidate start.  */
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
253*38fd1498Szrj #endif /* HAVE_SYNC_BUILTINS */
254*38fd1498Szrj
255*38fd1498Szrj
256*38fd1498Szrj /* This function implements the GUIDED scheduling method. Arguments are
257*38fd1498Szrj as for gomp_iter_static_next. This function must be called with the
258*38fd1498Szrj work share lock held. */
259*38fd1498Szrj
260*38fd1498Szrj bool
gomp_iter_guided_next_locked(long * pstart,long * pend)261*38fd1498Szrj gomp_iter_guided_next_locked (long *pstart, long *pend)
262*38fd1498Szrj {
263*38fd1498Szrj struct gomp_thread *thr = gomp_thread ();
264*38fd1498Szrj struct gomp_work_share *ws = thr->ts.work_share;
265*38fd1498Szrj struct gomp_team *team = thr->ts.team;
266*38fd1498Szrj unsigned long nthreads = team ? team->nthreads : 1;
267*38fd1498Szrj unsigned long n, q;
268*38fd1498Szrj long start, end;
269*38fd1498Szrj
270*38fd1498Szrj if (ws->next == ws->end)
271*38fd1498Szrj return false;
272*38fd1498Szrj
273*38fd1498Szrj start = ws->next;
274*38fd1498Szrj n = (ws->end - start) / ws->incr;
275*38fd1498Szrj q = (n + nthreads - 1) / nthreads;
276*38fd1498Szrj
277*38fd1498Szrj if (q < ws->chunk_size)
278*38fd1498Szrj q = ws->chunk_size;
279*38fd1498Szrj if (q <= n)
280*38fd1498Szrj end = start + q * ws->incr;
281*38fd1498Szrj else
282*38fd1498Szrj end = ws->end;
283*38fd1498Szrj
284*38fd1498Szrj ws->next = end;
285*38fd1498Szrj *pstart = start;
286*38fd1498Szrj *pend = end;
287*38fd1498Szrj return true;
288*38fd1498Szrj }
289*38fd1498Szrj
290*38fd1498Szrj #ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
	return false;

      /* Remaining iterations, then this grab's per-thread share,
	 rounded up; never take fewer than chunk_size at a time.  */
      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      /* Asking for more than remains means taking everything left.  */
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      /* Claim [start, nend); on CAS failure retry from the value some
	 other thread stored into ws->next.  */
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
338*38fd1498Szrj #endif /* HAVE_SYNC_BUILTINS */
339