/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread has
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
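      /* For example, with n = 10 and nthreads = 3: q = 3 and t = 1, so
         thread 0 gets [0, 4), thread 1 gets [4, 7) and thread 2 gets
         [7, 10); the first n % nthreads threads each receive one extra
         iteration.  */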

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */
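      /* For example, with n = 10, chunk_size = 2 and nthreads = 2,
         successive calls give thread 0 the zero-based ranges [0, 2),
         [4, 6) and [8, 10), and thread 1 the ranges [2, 4) and [6, 8)
         before it runs out of work.  */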

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Chunks are handed out round-robin: on trip number static_trip this
         thread's C-sized chunk starts static_trip * nthreads + team_id
         chunks into the iteration space.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Stop when this thread's chunk starts at or past the end of the
         iteration space, and clamp a final chunk that merely runs past
         it.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
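  /* Bit 1 of ws->mode is set when the work share was initialized for a
     downward-counting loop; chunk and left then hold negative distances
     in two's complement, so the clamping comparison is reversed.  Either
     way, the chunk is clamped so it does not run past ws->end_ull.  */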
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

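  /* Bit 0 of ws->mode is set when the work-share initialization determined
     that the chunk arithmetic cannot overflow, so an unconditional
     fetch-and-add is safe; otherwise fall back to the compare-and-swap
     loop below.  */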
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

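  /* Fallback path: re-read ws->next_ull and retry a compare-and-swap,
     clamping the chunk to the iterations that remain, until no other
     thread has raced us.  */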
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
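  /* Hand out roughly 1/nthreads of the remaining iterations, but never
     less than chunk_size; only the final chunk may be smaller.  */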
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

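  /* Recompute the guided chunk from the freshly observed start on every
     retry and publish it with a compare-and-swap; another thread racing
     us simply forces one more trip around the loop.  */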
  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */