xref: /dflybsd-src/contrib/gcc-8.0/libgomp/ordered.c (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"


/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

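  /* Descriptive note: ordered_team_ids is used as a circular queue of
     team ids, so the new entry wraps around.  For example, with
     nthreads == 4, ordered_cur == 3 and ordered_num_used == 2, the new
     slot is (3 + 2) - 4 == 1.  */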
  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
         where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

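  /* Descriptive note: under a static schedule the first iteration block
     is assigned to thread 0, so release it up front.  */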
  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first come first
   served like the others, so we're to move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
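
/* Illustrative sketch (not part of the original source): for a user loop
   such as

       #pragma omp parallel for ordered schedule(dynamic)
       for (i = 0; i < n; i++)
         {
           compute (i);
       #pragma omp ordered
           emit (i);          /- runs in iteration order -/
         }

   the compiler is expected to bracket the ordered block with calls to
   GOMP_ordered_start () and GOMP_ordered_end () above, while libgomp's
   ordered loop start/next routines maintain the release queue via
   gomp_ordered_first/next/last.  The exact lowering is
   compiler-dependent.  */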

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
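
/* Descriptive note: when the collapsed iteration counts of a doacross loop
   fit into MAX_COLLAPSED_BITS bits in total, each per-entry completion
   record is kept "flattened" in a single unsigned long; otherwise one
   unsigned long is kept per loop dimension (the unsigned long long
   variants further below additionally split each count into two longs
   when long long is wider than long).  See gomp_doacross_init.  */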

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
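  /* Descriptive note: round each entry up to 64 bytes; together with the
     64-byte alignment of the array below, this presumably keeps entries
     touched by different threads in separate cache lines.  */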
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
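
/* Worked example of the flattened encoding (derived from the code above):
   for ordered(2) with counts = { 1024, 20 }, the inner dimension needs
   5 bits (to hold 0..19) and the outer needs 10, so shift_counts becomes
   { 5, 0 } and 15 bits are used in total.  An iteration (i, j) is then
   encoded as ((i << 5) | j) + 1, and a single unsigned long per entry
   records the most recently posted iteration.  */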

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
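
/* Illustrative sketch (not part of the original source): for a doacross
   loop such as

       #pragma omp for ordered(1)
       for (i = 1; i < n; i++)
         {
       #pragma omp ordered depend(sink: i - 1)
           use (a[i - 1]);
           a[i] = produce (i);
       #pragma omp ordered depend(source)
         }

   the depend(sink: ...) clause is expected to lower to a
   GOMP_doacross_wait call with the sink iteration numbers, and
   depend(source) to a GOMP_doacross_post call with the current
   iteration numbers; the exact argument lowering is
   compiler-dependent.  */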

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                              + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
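      /* Descriptive note: gomp_ull is wider than unsigned long here
         (e.g. 32-bit long), so each count is stored as two unsigned
         longs: the high half at array[2 * i] and the low half at
         array[2 * i + 1].  */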

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}