/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread has
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
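      /* For example, with n = 10 and nthreads = 3: q = 3 and t = 1, so
         thread 0 gets [0, 4), thread 1 gets [4, 7) and thread 2 gets
         [7, 10); the first n % nthreads threads each receive one extra
         iteration.  */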

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */
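      /* For example, with n = 10, chunk_size = 2 and nthreads = 2,
         successive calls give thread 0 the zero-based ranges [0, 2),
         [4, 6) and [8, 10), and thread 1 the ranges [2, 4) and [6, 8)
         before it runs out of work.  */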

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Chunks are handed out round-robin: on trip number static_trip this
         thread's C-sized chunk starts static_trip * nthreads + team_id
         chunks into the iteration space.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Stop when this thread's chunk starts at or past the end of the
         iteration space, and clamp a final chunk that merely runs past
         it.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
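  /* Bit 1 of ws->mode is set when the work share was initialized for a
     downward-counting loop; chunk and left then hold negative distances
     in two's complement, so the clamping comparison is reversed.  Either
     way, the chunk is clamped so it does not run past ws->end_ull.  */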
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

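  /* Bit 0 of ws->mode is set when the work-share initialization determined
     that the chunk arithmetic cannot overflow, so an unconditional
     fetch-and-add is safe; otherwise fall back to the compare-and-swap
     loop below.  */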
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

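  /* Fallback path: re-read ws->next_ull and retry a compare-and-swap,
     clamping the chunk to the iterations that remain, until no other
     thread has raced us.  */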
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
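  /* Hand out roughly 1/nthreads of the remaining iterations, but never
     less than chunk_size; only the final chunk may be smaller.  */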
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

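  /* Recompute the guided chunk from the freshly observed start on every
     retry and publish it with a compare-and-swap; another thread racing
     us simply forces one more trip around the loop.  */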
  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */