/* Copyright (C) 2005-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this
   thread received the very last iteration.  */
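
/* Illustrative call pattern (a sketch, not part of libgomp; body and incr
   are placeholders): a worker running a statically scheduled ull loop with
   increment incr would do roughly

     gomp_ull start, end;
     while (gomp_iter_ull_static_next (&start, &end) == 0)
       for (gomp_ull i = start; i != end; i += incr)
         body (i);

   The real callers live in loop_ull.c.  */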

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
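  /* For example (illustrative numbers only): 103 iterations split over
     4 threads gives q = 25 with t = 3 left over, so threads 0-2 each get
     one block of 26 iterations and thread 3 gets the remaining 25.  */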
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
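      /* (Here a nonzero ws->mode means the loop counts downward, so the
         difference and the increment are negated to keep N a positive
         iteration count; this reading follows the work-share setup in
         gomp_loop_ull_init in loop_ull.c.)  */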
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations were allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess: a chunk of C iterations placed static_trip * nthreads
         chunks in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
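      /* Illustration (made-up numbers): with 3 threads and a chunk size of
         4, thread 1 on its third trip (static_trip == 2) gets
         s0 = (2 * 3 + 1) * 4 = 28 and e0 = 32; chunks are thus dealt out
         round-robin across the team.  */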

      /* Detect when this thread's chunk falls past the end of the
         iteration space, and clamp a partial final chunk.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
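  /* Bit 1 of ws->mode (value 2, tested below) marks a loop that counts
     downward; for such loops CHUNK and LEFT are both "negative" when read
     as unsigned values, so the clamp is inverted.  This reading follows
     the work-share setup in gomp_loop_ull_init (loop_ull.c).  */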
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

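  /* Bit 0 of ws->mode is set when the loop bounds were checked at
     initialization so that repeatedly adding CHUNK to next_ull cannot wrap
     around; a plain atomic fetch-and-add then suffices.  Otherwise we fall
     back to the compare-and-swap loop further down.  (This reading of the
     mode bits follows gomp_loop_ull_init in loop_ull.c.)  */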
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

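      /* Try to claim [start, nend); if another thread advanced next_ull in
         the meantime, the CAS returns that thread's value and we retry with
         it as our new start.  */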
      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */
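
/* Illustration (made-up numbers): with 1000 iterations remaining, 4 threads
   and a chunk_size of 10, the next block handed out covers
   max (ceil (1000 / 4), 10) = 250 iterations; as the remainder shrinks the
   blocks shrink with it, never dropping below chunk_size.  */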

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */