/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"

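/* A rough road map of the queue used by the ORDERED entry points below
   (this only summarizes the code in this file): each work share keeps a
   small circular queue of team ids.  ws->ordered_team_ids[] records the
   order in which threads will be granted the ORDERED section,
   ws->ordered_cur indexes the current head, ws->ordered_num_used counts the
   queued threads, and ws->ordered_owner is the team id of the thread that
   currently owns the section (or -1).  Each thread sleeps on its own
   semaphore in team->ordered_release[] until the previous owner posts it.  */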

/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release semaphore now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
	next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
	 where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
	index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}
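
/* As an illustration of the queue protocol above (three threads T0, T1 and
   T2 on a dynamically scheduled ordered loop, with T0 arriving first): each
   thread calls gomp_ordered_first when it takes its first iteration block,
   so the queue becomes [T0, T1, T2] and T0's semaphore is pre-posted.  When
   T0 takes its next block it calls gomp_ordered_next, which rotates it to
   the tail and posts T1's semaphore; when T0 runs out of iterations it
   calls gomp_ordered_last instead, which drops it from the queue and
   likewise wakes the next thread.  */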


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next iteration block.  Static schedules are not first-come,
   first-served like the others, so we move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
	__attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
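
/* For reference, a rough sketch of the user-level construct that the two
   entry points above implement (illustrative only; the do_* calls are
   placeholders and the exact calls emitted around the loop are chosen by
   the compiler, not by this file):

       #pragma omp for ordered schedule(dynamic)
       for (i = 0; i < n; i++)
	 {
	   do_unordered_work (i);
       #pragma omp ordered
	   do_in_order_work (i);
	 }

   The ordered body is bracketed by GOMP_ordered_start and GOMP_ordered_end,
   while the ordered loop iterators (in loop.c) are expected to call
   gomp_ordered_first, gomp_ordered_next and gomp_ordered_last as iteration
   blocks are handed out.  */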

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

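/* A note on the encoding used by the doacross code below (this only
   summarizes what the functions in this file do): gomp_doacross_init
   allocates one counter entry per scheduling chunk (one per thread for
   static schedules, one per outermost iteration for guided, one per chunk
   otherwise).  When the collapsed iteration counts together fit in
   MAX_COLLAPSED_BITS, the loop indices of a posted iteration are packed
   into a single unsigned long per entry, each dimension shifted by
   shift_counts[] ("flattened" mode); otherwise each entry is an array with
   one slot per dimension.  Entries always hold "last posted iteration + 1",
   so the initial zero means nothing has been posted yet.  Entry sizes are
   rounded up to a multiple of 64 bytes and the array is 64-byte aligned,
   which keeps different entries on separate cache lines on typical targets.

   A small worked example, assuming collapse(2) with counts = {1000, 20}:
   999 needs 10 bits and 19 needs 5 bits, so shift_counts = {5, 0} and the
   flattened value posted for iteration (i0, i1) is ((i0 << 5) | i1) + 1.  */
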
void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
		    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
	ws->doacross = NULL;
      else
	{
	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
	  doacross->extra = (void *) (doacross + 1);
	  ws->doacross = doacross;
	}
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
			- __builtin_clzl (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
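
/* How a posted or awaited iteration finds its entry (again a summary of the
   schedule-specific arithmetic in the post/wait functions below): GFS_STATIC
   threads use their own entry indexed by team_id; GFS_GUIDED indexes the
   entry by the outermost iteration number; other schedules use
   iteration / chunk_size.  For a static schedule with chunk_size == 0 the
   wait side recovers the owning thread from q = counts[0] / nthreads and
   t = counts[0] % nthreads: the first t threads own q + 1 iterations each
   (covering iterations below boundary = t * (q + 1)) and the remaining
   threads own q iterations each.  */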

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) counts[i]
		     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
	{
	  if (first < doacross->boundary)
	    ent = first / (doacross->q + 1);
	  else
	    ent = (first - doacross->boundary) / doacross->q
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) va_arg (ap, long)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
	{
	  unsigned long thisv
	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	  if (thisv < cur)
	    {
	      i = doacross->ncounts;
	      break;
	    }
	  if (thisv > cur)
	    break;
	}
      va_end (ap);
      if (i == doacross->ncounts)
	break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
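
/* For reference, a rough sketch of the source-level doacross construct that
   the POST and WAIT entry points implement (illustrative only; the depend
   syntax is standard OpenMP, v and f are placeholders, and the exact
   lowering is the compiler's business):

       #pragma omp for ordered(2)
       for (i = 0; i < n; i++)
	 for (j = 0; j < m; j++)
	   {
       #pragma omp ordered depend(sink: i - 1, j)
	     v[i][j] = f (v[i - 1][j]);
       #pragma omp ordered depend(source)
	   }

   Each depend(sink: ...) maps to a GOMP_doacross_wait (or _ull_wait) call
   with the sink iteration vector, and depend(source) maps to
   GOMP_doacross_post (or _ull_post) with the current iteration vector.  */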

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
			gomp_ull chunk_size, size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
	ws->doacross = NULL;
      else
	{
	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
	  doacross->extra = (void *) (doacross + 1);
	  ws->doacross = doacross;
	}
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
			- __builtin_clzll (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
	elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
	elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
	abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz + extra);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
			      + ent * doacross->elt_sz);
      gomp_ull flattened
	= counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
	}
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  gomp_ull cull = counts[i] + 1UL;
	  unsigned long c = (unsigned long) cull;
	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
	}
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
	{
	  if (first < doacross->boundary_ull)
	    ent = first / (doacross->q_ull + 1);
	  else
	    ent = (first - doacross->boundary_ull) / doacross->q_ull
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= va_arg (ap, gomp_ull)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	      if (thisv < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (thisv > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      unsigned long t
		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	      unsigned long cur
		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	      t = thisv;
	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  __sync_synchronize ();
}