/* Copyright (C) 2005-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread received
   the very last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

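  /* static_trip records this thread's progress through the static
     schedule: 0 before the first call, positive once it has taken at
     least one chunk, and -1 once it has been handed the final
     iteration of the loop.  */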
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
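      /* That is, N is ceil((end - next) / incr) for an increasing loop,
         computed entirely in unsigned arithmetic; a nonzero WS->MODE here
         indicates a decreasing loop, for which both differences above are
         negated so the same unsigned division applies.  */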
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
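      /* For example, with N = 10 iterations and NTHREADS = 4: Q = 2 and
         T = 2, so threads 0 and 1 take three iterations each ([0,3) and
         [3,6)) while threads 2 and 3 take two each ([6,8) and [8,10)).  */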

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* The initial guess is a chunk of size C, positioned STATIC_TRIP
         rounds of NTHREADS chunks into the iteration space, offset by
         our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
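      /* For example, with C = 4, NTHREADS = 3 and team_id 1, this thread
         takes [4,8) on its first trip and [16,20) on its second, skipping
         over the chunks handed to the other two threads.  */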

      /* Stop once we have run past the total iteration count, and trim
         the final chunk if it extends past the end.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
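  /* For a decreasing loop (WS->MODE bit 1 set), the chunk size was
     pre-scaled by the negative increment when the work share was
     initialized, so CHUNK and LEFT are both large unsigned values here
     and the sense of the clamping comparison below is inverted.  */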
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

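  /* Bit 0 of WS->MODE is set when the work share is initialized only if
     END is far enough below the maximum of gomp_ull that racing
     fetch-and-adds of CHUNK from every thread cannot wrap past it,
     making the unconditional add below safe.  */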
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
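  /* Slow path: claim a chunk with a compare-and-swap loop.  On
     contention the CAS returns the value another thread installed,
     which becomes our new START; the clamped chunk is recomputed from
     it until the CAS succeeds or the iteration space is exhausted.  */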
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;
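  /* Each grab takes roughly 1/NTHREADS of the iterations that remain,
     so chunks shrink as the loop drains.  For example, with 100
     iterations remaining and 4 threads, successive chunk sizes are
     25, 19, 14, 11, ... until the CHUNK_SIZE floor below takes over.  */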

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */