/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"

/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}
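
/* For illustration only (not part of the upstream sources): the fields
   ordered_team_ids, ordered_cur and ordered_num_used implement a circular
   queue of team ids with room for one entry per thread.  Assuming a team
   of 4 threads with ordered_cur == 3 and ordered_num_used == 2, the
   enqueue above computes index = 3 + 2 = 5, which wraps to 5 - 4 = 1,
   i.e. the slot just past the last occupied one.  gomp_ordered_last and
   gomp_ordered_next below advance ordered_cur through the same array.  */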

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're
   allocating another.  This is the logical combination of a call to
   gomp_ordered_last followed by a call to gomp_ordered_first.  The
   work-share lock must be held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
         where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first come first
   served like the others, so we're to move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
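
/* For illustration only (not part of the upstream sources): a user loop
   such as

     #pragma omp for ordered schedule(dynamic)
     for (i = 0; i < n; i++)
       {
         compute (i);
     #pragma omp ordered
         emit (i);
       }

   is lowered by the compiler so that each thread brackets the ordered
   body with calls to GOMP_ordered_start () and GOMP_ordered_end (),
   while the loop-iteration routines call gomp_ordered_first/next/last
   under the work-share lock to maintain the release queue used above.
   compute and emit are hypothetical helpers, shown only to mark where
   the ordered region sits inside the chunked loop.  */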

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
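
/* Worked example, for illustration only (not part of the upstream
   sources): for a doacross loop nest with ncounts == 2 and
   counts = { 1000, 50 }, the loop above needs 10 bits for the first
   dimension (999 fits in 10 bits) and 6 bits for the second, so
   num_bits == 16 and the flattened representation is used with
   shift_counts = { 6, 0 }.  GOMP_doacross_post below then publishes the
   single value ((c0 << 6) | c1) + 1, where the + 1 keeps 0 meaning
   "nothing posted yet" in the zero-initialized array.  */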

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}
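
/* For illustration only (not part of the upstream sources): in a doacross
   loop nest such as

     #pragma omp for ordered(2)
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         {
     #pragma omp ordered depend(sink: i - 1, j)
           consume (i - 1, j);
           produce (i, j);
     #pragma omp ordered depend(source)
         }

   the depend(source) point is lowered to a GOMP_doacross_post call with
   the current iteration vector, and each depend(sink: ...) clause to a
   GOMP_doacross_wait call with the iteration it depends on.  consume and
   produce are hypothetical helpers, shown only to give the calls a
   context.  */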

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
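
/* For illustration only (not part of the upstream sources): in the
   non-flattened case each array entry holds the last posted iteration
   vector with every component incremented by one, and the loop above is
   a lexicographic comparison against the waited-for vector.  Assuming
   ncounts == 2 and a wait on iteration (4, 7), the thread keeps spinning
   while the entry reads e.g. { 5, 3 } (same outer iteration, inner one
   not far enough), and returns once it reads { 5, 8 } or anything
   larger, such as { 6, 1 }.  */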

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
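
/* For illustration only (not part of the upstream sources): when
   unsigned long long is twice as wide as unsigned long (for instance on
   a 32-bit target), each 64-bit count is stored as two unsigned longs,
   with the high half at array[2 * i] and the low half at
   array[2 * i + 1]; the matching non-flattened wait below compares the
   high halves first and only then the low halves.  */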

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}