/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this
   thread received the very last iteration.  */

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
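  /* For illustration (hypothetical numbers, assuming ws->next = 0 and
     ws->incr = 1): with n = 10 iterations and nthreads = 4, the split
     below gives q = 2 and t = 2, so threads 0 and 1 take three iterations
     each ([0,3) and [3,6)) while threads 2 and 3 take two ([6,8) and
     [8,10)).  */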
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
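      /* The bias S below makes the division round the trip count up:
	 adding incr - 1 (or incr + 1 when the increment is negative)
	 before dividing yields a ceiling division for either sign of
	 ws->incr.  */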
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* The initial guess is a C-sized chunk positioned static_trip *
	 nthreads chunks in, offset by our thread number.  */
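      /* For illustration (hypothetical numbers, assuming ws->next = 0 and
	 ws->incr = 1): with c = 4, nthreads = 2 and n = 20, thread 0
	 claims [0,4) on trip 0, [8,12) on trip 1 and [16,20) on trip 2,
	 while thread 1 claims [4,8) and then [12,16).  */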
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;

      /* Detect when this thread's chunks have run off the end of the
	 iteration space.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
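
/* A minimal usage sketch (hypothetical, not part of libgomp itself): a
   caller keeps requesting [start, end) slices until a nonzero return says
   this thread's share is exhausted.  Disabled code, for illustration only;
   it assumes an increment of 1.  */
#if 0
static void
example_static_driver (void)
{
  long start, end, i;

  while (gomp_iter_static_next (&start, &end) == 0)
    for (i = start; i < end; i++)
      ;  /* ... loop body for iteration I ...  */
}
#endif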


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
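  /* Clamp the chunk to the iterations that remain.  For dynamic loops the
     chunk size stored in the work share is pre-scaled by the increment, so
     with a negative increment both CHUNK and LEFT are negative and the
     comparison direction flips.  */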
  if (ws->incr < 0)
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}
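
/* A minimal usage sketch (hypothetical, not part of libgomp itself):
   callers take the work-share lock around the call and then execute the
   returned half-open range.  Disabled code, for illustration only.  */
#if 0
static void
example_dynamic_locked_driver (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, i;
  bool more;

  gomp_mutex_lock (&ws->lock);
  more = gomp_iter_dynamic_next_locked (&start, &end);
  gomp_mutex_unlock (&ws->lock);

  if (more)
    for (i = start; i != end; i += ws->incr)
      ;  /* ... loop body for iteration I ...  */
}
#endif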


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

  if (__builtin_expect (ws->mode, 1))
    {
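      /* Fast path: a nonzero WS->MODE indicates that loop initialization
	 arranged the bounds so that concurrent overshoot of WS->NEXT past
	 END cannot wrap a long.  A bare fetch-and-add therefore suffices:
	 a thread that overshoots simply sees TMP at or beyond END and
	 reports no work.  */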
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
	return false;

      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}
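
/* To illustrate the guided schedule above with hypothetical numbers: with
   100 iterations remaining, 4 threads, a unit increment and chunk_size 1,
   successive calls carve off chunks of 25, 19, 14, 11, 8, 6, 5, 3, 3, 2,
   1, ...; each call claims a 1/nthreads share of whatever remains, so
   chunks shrink geometrically toward the chunk_size floor.  */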

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
	return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */