/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

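/* Reader's note on ws->mode, added here for readability.  This is an
   assumption drawn from how gomp_loop_ull_init in loop_ull.c appears to set
   the field up, not a statement from this file: bit 1 (mode & 2) is set when
   the loop counts downward, and the low bit (mode & 1) is set only for
   dynamic scheduling when initialization could prove that an unconditional
   fetch-and-add on ws->next_ull cannot overflow past ws->end_ull.  */
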
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
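
/* Illustrative sketch, not part of libgomp and never built: a standalone
   re-derivation of the zero-based partition math used by the
   chunk_size == 0 branch above.  The helper name, the printing, and the
   example numbers are invented for the sketch; only the q/t/s0/e0
   arithmetic mirrors the code above.  */
#if 0
#include <stdio.h>

/* Split N iterations among NTHREADS threads the way the chunk_size == 0
   branch does: the first n % nthreads threads get one extra iteration,
   and every thread makes exactly one trip.  */
static void
example_static_partition (unsigned long long n, unsigned long nthreads)
{
  unsigned long i;
  for (i = 0; i < nthreads; i++)
    {
      unsigned long long q = n / nthreads;
      unsigned long long t = n % nthreads;
      unsigned long long s0, e0;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
      /* E.g. n = 10, nthreads = 4 gives [0,3) [3,6) [6,8) [8,10).  */
      printf ("thread %lu: [%llu, %llu)\n", i, s0, e0);
    }
}

int
main (void)
{
  example_static_partition (10, 4);
  return 0;
}
#endif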


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
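
/* Illustrative sketch, never built: how a caller is expected to drive the
   locked variant, mirroring the pattern used by the loop_ull.c entry points
   when the lock-free path is unavailable.  This is a hedged example written
   from that pattern, not a verbatim copy of another file; the wrapper name
   is invented.  */
#if 0
static bool
example_dynamic_next_with_lock (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  bool ret;

  /* Serialize updates of ws->next_ull with the work-share lock.  */
  gomp_mutex_lock (&thr->ts.work_share->lock);
  ret = gomp_iter_ull_dynamic_next_locked (pstart, pend);
  gomp_mutex_unlock (&thr->ts.work_share->lock);

  return ret;
}
#endif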


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
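
/* Illustrative sketch, never built: the shape of the compare-and-swap claim
   loop above, reduced to a self-contained toy for an upward loop with no
   mode bits.  Each caller re-reads the shared cursor, computes a clamped
   chunk, and retries if another thread advanced the cursor first.  The
   function and parameter names are invented for the example.  */
#if 0
static int
example_claim_chunk (unsigned long long *cursor, unsigned long long end,
		     unsigned long long chunk,
		     unsigned long long *pstart, unsigned long long *pend)
{
  unsigned long long start = __atomic_load_n (cursor, __ATOMIC_RELAXED);
  unsigned long long nend = 0;

  while (1)
    {
      unsigned long long left = end - start;
      unsigned long long old;

      if (start == end)
	return 0;		/* Nothing left to claim.  */
      if (chunk > left)
	chunk = left;		/* Clamp the final, partial chunk.  */
      nend = start + chunk;

      old = __sync_val_compare_and_swap (cursor, start, nend);
      if (old == start)
	break;			/* This thread won the race.  */
      start = old;		/* Lost; retry from the new cursor.  */
    }

  *pstart = start;
  *pend = nend;
  return 1;
}
#endif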


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}
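
/* Illustrative sketch, never built: how the guided chunk size decays as the
   remaining iteration count shrinks.  The names and numbers are invented;
   only the q = ceil(remaining / nthreads), q >= chunk_size rule mirrors the
   code above.  E.g. remaining = 100, nthreads = 4, chunk_size = 3 yields
   chunks of 25, 19, 14, 11, 8, 6, 5, 3, 3, 3, 3.  */
#if 0
#include <stdio.h>

static void
example_guided_chunks (unsigned long long n, unsigned long long nthreads,
		       unsigned long long chunk_size)
{
  while (n > 0)
    {
      unsigned long long q = (n + nthreads - 1) / nthreads;
      if (q < chunk_size)
	q = chunk_size;
      if (q > n)
	q = n;		/* Final partial chunk, like end = ws->end_ull.  */
      printf ("%llu\n", q);
      n -= q;
    }
}

int
main (void)
{
  example_guided_chunks (100, 4, 3);
  return 0;
}
#endif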

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */