/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"


/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}
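
/* For illustration: the queue is a circular buffer of team ids indexed
   modulo team->nthreads.  With nthreads == 4, ordered_cur == 2 and
   ordered_num_used == 3, a newly arriving thread is recorded above at
   index (2 + 3) % 4 == 1, directly behind the threads already queued at
   indices 2, 3 and 0.  */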

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent allocation block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
         where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
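  /* For illustration: when ordered_num_used == team->nthreads, the slot at
     (ordered_cur + ordered_num_used) % nthreads is ordered_cur itself,
     which already holds our id (we are the current head), so advancing
     ordered_cur alone re-queues us at the tail.  */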
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first-come,
   first-served like the others, so we're to move to the numerically next
   thread, not the next thread on a list.  The work-share lock should *not*
   be held on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}
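
/* For illustration: with team->nthreads == 4, the thread with team_id == 1
   hands the ordered section to thread 2 above by posting
   team->ordered_release[2]; thread 3 wraps around and posts
   team->ordered_release[0].  The new head is recorded in
   ordered_team_ids[0].  */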

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
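
/* For illustration (a sketch of how these entry points are reached, not a
   specification): a user loop such as

	#pragma omp for ordered schedule(dynamic)
	for (i = 0; i < n; i++)
	  {
	    compute (i);
	    #pragma omp ordered
	    emit (i);
	  }

   is compiled so that the body of the ordered region is bracketed by calls
   to GOMP_ordered_start () and GOMP_ordered_end (), while the
   gomp_ordered_first/next/last bookkeeping above is performed by the loop
   iteration routines as chunks are handed out.  compute and emit stand for
   arbitrary user code.  */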

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
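
/* For illustration (a sketch, not a specification of the lowering): an
   OpenMP doacross loop such as

	#pragma omp for ordered(2)
	for (i = 1; i < n; i++)
	  for (j = 1; j < m; j++)
	    {
	      #pragma omp ordered depend(sink: i-1, j) depend(sink: i, j-1)
	      work (i, j);
	      #pragma omp ordered depend(source)
	    }

   is compiled so that each depend(sink: ...) clause becomes a call to
   GOMP_doacross_wait with that sink iteration's loop counts, and the
   depend(source) clause becomes a call to GOMP_doacross_post with the
   current iteration's counts.  gomp_doacross_init below is called from
   the loop-start routines to set up the per-work-share completion array.
   work stands for arbitrary user code.  */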

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
                    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
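
/* Worked example, for illustration (assuming 64-bit unsigned long): with
   ncounts == 2 and counts == { 1000, 20 }, the loop above computes
   bits[0] == 10 (999 < 2^10) and bits[1] == 5 (19 < 2^5), so num_bits == 15
   and the flattened representation is used, with shift_counts[1] == 0 and
   shift_counts[0] == 5.  Iteration (3, 7) is then encoded as
   (3 << 5) | 7 == 103; GOMP_doacross_post below stores 103 + 1 == 104 into
   the chunk's entry, and a waiter for sink (3, 7) computes 103 and spins
   until the entry exceeds that value.  */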

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
                        gomp_ull chunk_size, size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
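
/* For illustration: on a target where unsigned long is 32 bits and gomp_ull
   is 64 bits, each count occupies two unsigned longs in the entry: element
   2*i holds the high half and element 2*i + 1 the low half.  Posting
   counts[i] + 1 == 0x100000001 above stores 1 into array[2*i + 1] and then
   1 into array[2*i]; GOMP_doacross_ull_wait below reads the high half first
   and then the low half when comparing.  */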

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}