/* Copyright (C) 2005-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread received
   the very last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

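  /* static_trip records this thread's progress through the static
     schedule: 0 before the first call, positive once it has taken at
     least one chunk, and -1 once it has been handed the final
     iteration of the loop.  */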
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
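      /* That is, N is ceil((end - next) / incr) for an increasing loop,
         computed entirely in unsigned arithmetic; a nonzero WS->MODE here
         indicates a decreasing loop, for which both differences above are
         negated so the same unsigned division applies.  */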
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
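      /* For example, with N = 10 iterations and NTHREADS = 4: Q = 2 and
         T = 2, so threads 0 and 1 take three iterations each ([0,3) and
         [3,6)) while threads 2 and 3 take two each ([6,8) and [8,10)).  */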

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* The initial guess is a chunk of size C, positioned STATIC_TRIP
         rounds of NTHREADS chunks into the iteration space, offset by
         our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
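      /* For example, with C = 4, NTHREADS = 3 and team_id 1, this thread
         takes [4,8) on its first trip and [16,20) on its second, skipping
         over the chunks handed to the other two threads.  */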

      /* Stop once we have run past the total iteration count, and trim
         the final chunk if it extends past the end.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
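  /* For a decreasing loop (WS->MODE bit 1 set), the chunk size was
     pre-scaled by the negative increment when the work share was
     initialized, so CHUNK and LEFT are both large unsigned values here
     and the sense of the clamping comparison below is inverted.  */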
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

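  /* Bit 0 of WS->MODE is set when the work share is initialized only if
     END is far enough below the maximum of gomp_ull that racing
     fetch-and-adds of CHUNK from every thread cannot wrap past it,
     making the unconditional add below safe.  */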
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
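  /* Slow path: claim a chunk with a compare-and-swap loop.  On
     contention the CAS returns the value another thread installed,
     which becomes our new START; the clamped chunk is recomputed from
     it until the CAS succeeds or the iteration space is exhausted.  */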
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;
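  /* Each grab takes roughly 1/NTHREADS of the iterations that remain,
     so chunks shrink as the loop drains.  For example, with 100
     iterations remaining and 4 threads, successive chunk sizes are
     25, 19, 14, 11, ... until the CHUNK_SIZE floor below takes over.  */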

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */