/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"

/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}
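
/* For illustration only (not part of the upstream sources): the fields
   ordered_team_ids, ordered_cur and ordered_num_used implement a circular
   queue of team ids with room for one entry per thread.  Assuming a team
   of 4 threads with ordered_cur == 3 and ordered_num_used == 2, the
   enqueue above computes index = 3 + 2 = 5, which wraps to 5 - 4 = 1,
   i.e. the slot just past the last occupied one.  gomp_ordered_last and
   gomp_ordered_next below advance ordered_cur through the same array.  */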

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're
   allocating another.  This is the logical combination of a call to
   gomp_ordered_last followed by a call to gomp_ordered_first.  The
   work-share lock must be held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
         where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first come first
   served like the others, so we're to move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
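
/* For illustration only (not part of the upstream sources): a user loop
   such as

     #pragma omp for ordered schedule(dynamic)
     for (i = 0; i < n; i++)
       {
         compute (i);
     #pragma omp ordered
         emit (i);
       }

   is lowered by the compiler so that each thread brackets the ordered
   body with calls to GOMP_ordered_start () and GOMP_ordered_end (),
   while the loop-iteration routines call gomp_ordered_first/next/last
   under the work-share lock to maintain the release queue used above.
   compute and emit are hypothetical helpers, shown only to mark where
   the ordered region sits inside the chunked loop.  */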

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
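
/* Worked example, for illustration only (not part of the upstream
   sources): for a doacross loop nest with ncounts == 2 and
   counts = { 1000, 50 }, the loop above needs 10 bits for the first
   dimension (999 fits in 10 bits) and 6 bits for the second, so
   num_bits == 16 and the flattened representation is used with
   shift_counts = { 6, 0 }.  GOMP_doacross_post below then publishes the
   single value ((c0 << 6) | c1) + 1, where the + 1 keeps 0 meaning
   "nothing posted yet" in the zero-initialized array.  */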

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}
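
/* For illustration only (not part of the upstream sources): in a doacross
   loop nest such as

     #pragma omp for ordered(2)
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         {
     #pragma omp ordered depend(sink: i - 1, j)
           consume (i - 1, j);
           produce (i, j);
     #pragma omp ordered depend(source)
         }

   the depend(source) point is lowered to a GOMP_doacross_post call with
   the current iteration vector, and each depend(sink: ...) clause to a
   GOMP_doacross_wait call with the iteration it depends on.  consume and
   produce are hypothetical helpers, shown only to give the calls a
   context.  */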

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
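
/* For illustration only (not part of the upstream sources): in the
   non-flattened case each array entry holds the last posted iteration
   vector with every component incremented by one, and the loop above is
   a lexicographic comparison against the waited-for vector.  Assuming
   ncounts == 2 and a wait on iteration (4, 7), the thread keeps spinning
   while the entry reads e.g. { 5, 3 } (same outer iteration, inner one
   not far enough), and returns once it reads { 5, 8 } or anything
   larger, such as { 6, 1 }.  */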

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
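
/* For illustration only (not part of the upstream sources): when
   unsigned long long is twice as wide as unsigned long (for instance on
   a 32-bit target), each 64-bit count is stored as two unsigned longs,
   with the high half at array[2 * i] and the low half at
   array[2 * i + 1]; the matching non-flattened wait below compares the
   high halves first and only then the low halves.  */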

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}