/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this
   thread received the very last iteration.  */

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
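      /* As a worked example with illustrative values (not from this
         file): ws->next == 0, ws->end == 10 and ws->incr == 3 give
         s == 2 and n == (10 - 0 + 2) / 3 == 4, matching the four
         iterations 0, 3, 6, 9.  Since s takes the sign of ws->incr,
         the same rounding works for downward loops.  */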
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
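
      /* For example, with hypothetical n == 10 and nthreads == 4:
         q == 2 and t == 2, so threads 0 and 1 take q + 1 == 3
         iterations and the rest take 2.  Thread 0 gets [0,3),
         thread 1 [3,6), thread 2 [6,8) and thread 3 [8,10); the
         remainder always lands on the lowest-numbered threads.  */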

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
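
      /* With hypothetical c == 4 and nthreads == 3, chunks are laid
         out round-robin: on trip 0, thread i claims [4*i, 4*i + 4);
         on trip 1 it claims [(3 + i) * 4, (3 + i) * 4 + 4); and so on
         until its chunk would start at or beyond n, caught below.  */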

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
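
/* A minimal sketch of how a caller might drive this function; the
   body () call and the incr variable are illustrative stand-ins, not
   part of this file:

     long s, e;
     while (gomp_iter_static_next (&s, &e) == 0)
       for (long i = s; i != e; i += incr)
         body (i);

   Each zero return hands back one chunk [s, e); e - s is always a
   multiple of incr, so i != e terminates exactly.  A nonzero return
   means this thread is finished, with -1 further indicating that it
   was assigned the very last iteration.  */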


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
  if (ws->incr < 0)
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
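
  /* For dynamic scheduling, gomp_loop_init appears to store chunk_size
     pre-multiplied by incr, so chunk is already a signed distance here.
     With hypothetical values incr == -1, start == 2, end == 0 and
     chunk == -3, left == -2 and -3 < -2, so the chunk is clamped to
     the two remaining iterations.  */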
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }
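
  /* To illustrate the fast path above with made-up values: next == 8,
     end == 10, chunk == 4, incr == 1 and two racing threads.  Both
     fetch-and-add; one observes tmp == 8 and trims its chunk to
     [8,10), the other observes tmp == 12 >= end and returns false.
     ws->next overshoots end, which is harmless: ws->mode is set by
     gomp_loop_init only when it can prove such additions never
     wrap.  */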

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
        return false;

      if (incr < 0)
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }
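
  /* A failed compare-and-swap returns the value some other thread
     installed, so a retry is cheap: e.g. if this thread computed
     [8,12) but a rival advanced ws->next to 12 first, tmp == 12
     becomes the new start and the next attempt claims from 12 on,
     with no separate reload of ws->next.  */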

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;
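
  /* Chunk sizes thus shrink as the loop drains.  With hypothetical
     n == 100, nthreads == 4 and chunk_size == 7, successive calls
     take ceil(n / 4) iterations: 25, then 19, then 14, and so on,
     never handing out fewer than 7 until fewer than 7 remain, at
     which point the final piece ends exactly at ws->end.  */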

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
        return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */