/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolute last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
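      /* A nonzero return here (next == end) can only mean the loop had
	 zero iterations to begin with.  */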
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
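
      /* Worked example (added for illustration): with n = 10 and
	 nthreads = 4, q = 2 and t = 2, so threads 0 and 1 take three
	 iterations each ([0,3) and [3,6)) while threads 2 and 3 take two
	 each ([6,8) and [8,10)).  */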

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
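      /* -1 records that this thread just took the final iterations, so
	 the next call reports "had the absolute last iteration" via the
	 -1 return at the top of this function.  */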
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a c-sized chunk positioned static_trip * nthreads
	 chunks in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
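
      /* Worked example (added for illustration): with c = 2, nthreads = 3
	 and team_id = 1, trip 0 claims iterations [2,4) and trip 1 claims
	 (1*3 + 1) * 2 = [8,10); each thread advances nthreads chunks per
	 trip.  */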

      /* Detect when this chunk falls entirely past the end of the
	 iteration space.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
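
/* A sketch of the caller-side pattern, using invented identifiers (the
   real callers are the GOMP_loop_ull_* entry points in loop_ull.c).  For
   an upward-counting loop of step "step" it would look roughly like:

     gomp_ull istart, iend;
     while (gomp_iter_ull_static_next (&istart, &iend) == 0)
       for (gomp_ull i = istart; i < iend; i += step)
	 loop_body (i);

   i.e. each zero return hands the calling thread one [istart, iend)
   chunk of the iteration space.  */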


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
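  /* As I read the loop initialization in loop_ull.c, ws->mode & 2 marks a
     loop that counts downward; for dynamic scheduling chunk_size_ull was
     pre-multiplied by the (negative) increment there, so "chunk" and
     "left" are both negative when viewed as signed values and the
     clamping test below runs in the opposite direction from the upward
     case.  */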
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock to be held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;
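  /* As I read the initialization in loop_ull.c, bit 0 of ws->mode is set
     only when it was shown up front that next_ull cannot wrap around even
     if every thread overshoots end_ull by a full chunk, which is what
     makes the unconditional fetch-and-add below safe; otherwise we fall
     through to the compare-and-swap loop.  */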

  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
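  /* Compare-and-swap loop: reclamp the chunk against the most recently
     observed next_ull and retry until we win the race to advance it.  */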
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;
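  /* Illustrative numbers: with incr = 1, n = 100 iterations remaining,
     nthreads = 4 and chunk_size = 7, this call takes q = 25 iterations;
     the next taker sees 75 left and takes 19, then 14, and so on, until
     the per-thread share drops below 7, after which every chunk is 7
     (or whatever remains).  */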

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock to be held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;
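  /* Per the comment above, only next_ull is written concurrently; end,
     incr and chunk_size are fixed for the life of the work share, so it
     is safe to read them once outside the retry loop.  */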

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */