/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"

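/* A rough road map of the queue used by the ORDERED entry points below
   (this only summarizes the code in this file): each work share keeps a
   small circular queue of team ids.  ws->ordered_team_ids[] records the
   order in which threads will be granted the ORDERED section,
   ws->ordered_cur indexes the current head, ws->ordered_num_used counts the
   queued threads, and ws->ordered_owner is the team id of the thread that
   currently owns the section (or -1).  Each thread sleeps on its own
   semaphore in team->ordered_release[] until the previous owner posts it.  */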

/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release semaphore now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
	next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
	 where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
	index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}
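
/* As an illustration of the queue protocol above (three threads T0, T1 and
   T2 on a dynamically scheduled ordered loop, with T0 arriving first): each
   thread calls gomp_ordered_first when it takes its first iteration block,
   so the queue becomes [T0, T1, T2] and T0's semaphore is pre-posted.  When
   T0 takes its next block it calls gomp_ordered_next, which rotates it to
   the tail and posts T1's semaphore; when T0 runs out of iterations it
   calls gomp_ordered_last instead, which drops it from the queue and
   likewise wakes the next thread.  */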


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next iteration block.  Static schedules are not first-come,
   first-served like the others, so we move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
	__attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
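
/* For reference, a rough sketch of the user-level construct that the two
   entry points above implement (illustrative only; the do_* calls are
   placeholders and the exact calls emitted around the loop are chosen by
   the compiler, not by this file):

       #pragma omp for ordered schedule(dynamic)
       for (i = 0; i < n; i++)
	 {
	   do_unordered_work (i);
       #pragma omp ordered
	   do_in_order_work (i);
	 }

   The ordered body is bracketed by GOMP_ordered_start and GOMP_ordered_end,
   while the ordered loop iterators (in loop.c) are expected to call
   gomp_ordered_first, gomp_ordered_next and gomp_ordered_last as iteration
   blocks are handed out.  */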

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

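/* A note on the encoding used by the doacross code below (this only
   summarizes what the functions in this file do): gomp_doacross_init
   allocates one counter entry per scheduling chunk (one per thread for
   static schedules, one per outermost iteration for guided, one per chunk
   otherwise).  When the collapsed iteration counts together fit in
   MAX_COLLAPSED_BITS, the loop indices of a posted iteration are packed
   into a single unsigned long per entry, each dimension shifted by
   shift_counts[] ("flattened" mode); otherwise each entry is an array with
   one slot per dimension.  Entries always hold "last posted iteration + 1",
   so the initial zero means nothing has been posted yet.  Entry sizes are
   rounded up to a multiple of 64 bytes and the array is 64-byte aligned,
   which keeps different entries on separate cache lines on typical targets.

   A small worked example, assuming collapse(2) with counts = {1000, 20}:
   999 needs 10 bits and 19 needs 5 bits, so shift_counts = {5, 0} and the
   flattened value posted for iteration (i0, i1) is ((i0 << 5) | i1) + 1.  */
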
void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
		    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
	ws->doacross = NULL;
      else
	{
	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
	  doacross->extra = (void *) (doacross + 1);
	  ws->doacross = doacross;
	}
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
			- __builtin_clzl (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
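
/* How a posted or awaited iteration finds its entry (again a summary of the
   schedule-specific arithmetic in the post/wait functions below): GFS_STATIC
   threads use their own entry indexed by team_id; GFS_GUIDED indexes the
   entry by the outermost iteration number; other schedules use
   iteration / chunk_size.  For a static schedule with chunk_size == 0 the
   wait side recovers the owning thread from q = counts[0] / nthreads and
   t = counts[0] % nthreads: the first t threads own q + 1 iterations each
   (covering iterations below boundary = t * (q + 1)) and the remaining
   threads own q iterations each.  */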

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) counts[i]
		     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
	{
	  if (first < doacross->boundary)
	    ent = first / (doacross->q + 1);
	  else
	    ent = (first - doacross->boundary) / doacross->q
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) va_arg (ap, long)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
	{
	  unsigned long thisv
	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	  if (thisv < cur)
	    {
	      i = doacross->ncounts;
	      break;
	    }
	  if (thisv > cur)
	    break;
	}
      va_end (ap);
      if (i == doacross->ncounts)
	break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
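
/* For reference, a rough sketch of the source-level doacross construct that
   the POST and WAIT entry points implement (illustrative only; the depend
   syntax is standard OpenMP, v and f are placeholders, and the exact
   lowering is the compiler's business):

       #pragma omp for ordered(2)
       for (i = 0; i < n; i++)
	 for (j = 0; j < m; j++)
	   {
       #pragma omp ordered depend(sink: i - 1, j)
	     v[i][j] = f (v[i - 1][j]);
       #pragma omp ordered depend(source)
	   }

   Each depend(sink: ...) maps to a GOMP_doacross_wait (or _ull_wait) call
   with the sink iteration vector, and depend(source) maps to
   GOMP_doacross_post (or _ull_post) with the current iteration vector.  */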

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
			gomp_ull chunk_size, size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
	ws->doacross = NULL;
      else
	{
	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
	  doacross->extra = (void *) (doacross + 1);
	  ws->doacross = doacross;
	}
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
			- __builtin_clzll (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
	elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
	elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
	abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz + extra);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
			      + ent * doacross->elt_sz);
      gomp_ull flattened
	= counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
	}
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  gomp_ull cull = counts[i] + 1UL;
	  unsigned long c = (unsigned long) cull;
	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
	}
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
	{
	  if (first < doacross->boundary_ull)
	    ent = first / (doacross->q_ull + 1);
	  else
	    ent = (first - doacross->boundary_ull) / doacross->q_ull
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= va_arg (ap, gomp_ull)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	      if (thisv < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (thisv > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      unsigned long t
		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	      unsigned long cur
		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	      t = thisv;
	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  __sync_synchronize ();
}