/* Copyright (C) 2005-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>


/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this
   thread previously received the very last iteration.  */

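/* A minimal usage sketch, added for illustration only; the real callers
   are the GOMP_loop_static_* entry points in loop.c.  BODY and INCR are
   hypothetical stand-ins for the loop body and the work-share increment:

       long s, e;
       while (gomp_iter_static_next (&s, &e) == 0)
	 for (long x = s; x != e; x += INCR)
	   BODY (x);  */
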
int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
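      /* Worked example (illustration only): next=0, end=10, incr=3 gives
	 s=2 and n=(10-0+2)/3=4, i.e. the iterations 0,3,6,9; next=10,
	 end=0, incr=-2 gives s=-1 and n=(0-10-1)/-2=5, i.e. the
	 iterations 10,8,6,4,2.  */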
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
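
      /* Worked example (illustration only): n=10, nthreads=3 gives q=3,
	 t=1, so thread 0 takes [0,4), thread 1 takes [4,7) and thread 2
	 takes [7,10); the t leftover iterations go one apiece to the
	 lowest-numbered threads.  */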

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
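      /* Encoding note (added): static_trip becomes -1 when this thread's
	 block contains the final iteration (e0 == n), making the next
	 call return -1 as documented above; otherwise it becomes 1 and
	 the next call returns 1.  */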
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* The initial guess is a chunk of C iterations positioned
	 STATIC_TRIP * NTHREADS chunks into the range, offset by our
	 thread number.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
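
      /* Worked example (illustration only): with c=2 and nthreads=3,
	 thread 1 sees the zero-based chunks [2,4) on trip 0, [8,10) on
	 trip 1, [14,16) on trip 2, ..., advancing by nthreads * c
	 positions per trip.  */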

      /* Detect when the chunk would run past the total iteration count.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
  if (ws->incr < 0)
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
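
  /* Note (added; assumes the setup in gomp_loop_init): for dynamic
     scheduling ws->chunk_size is stored pre-multiplied by incr, so CHUNK
     and LEFT share incr's sign and the comparisons above clamp the final
     chunk so that START + CHUNK never passes ws->end.  */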
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock to be held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

  if (__builtin_expect (ws->mode, 1))
    {
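      /* Fast-path note (added): ws->mode is set nonzero at initialization
	 only when it is known that every thread can add its chunk without
	 overflowing long, so the unconditional fetch-and-add is safe; a
	 thread whose old value already lies past END returns false
	 below.  */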
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

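  /* Note (added): a relaxed load suffices here; the value is only the
     first guess for the compare-and-swap loop below, and a stale value
     merely costs one extra retry with the value the CAS hands back.  */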
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
	return false;

      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
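
/* A hedged sketch (added for illustration) of how compiler-generated code
   typically drives the dynamic iterator via the entry points in loop.c;
   BODY is a hypothetical stand-in for the loop body:

       long s, e;
       if (GOMP_loop_dynamic_start (0, n, 1, chunk, &s, &e))
	 do
	   for (long i = s; i < e; i++)
	     BODY (i);
	 while (GOMP_loop_dynamic_next (&s, &e));
       GOMP_loop_end ();

   GOMP_loop_dynamic_next ends up in gomp_iter_dynamic_next above, or in
   the _locked variant when sync builtins are unavailable.  */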


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;
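
  /* Worked example (illustration only): with incr=1, nthreads=4 and
     chunk_size=7, a remaining count n=100 gives q=25; once the remainder
     shrinks, say n=23, ceil(23/4)=6 is clamped up to the chunk floor of
     7; the final chunk is truncated at ws->end.  */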

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock to be held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
	return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */