/* Copyright (C) 2005-2022 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than zero if this thread
   received the absolute last iteration.  */

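/* An illustrative sketch of the caller contract, where do_work (s, e)
   is a hypothetical helper that runs s <= x < e by the loop increment
   (the real callers are the gomp_loop_static_* routines in loop.c):

     long s, e;
     if (gomp_iter_static_next (&s, &e) == 0)
       do_work (s, e);  */
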
int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
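      /* The adjustment by S makes this a ceiling division: e.g. with
         next = 0, end = 10, incr = 3 we get s = 2 and
         n = (10 + 2) / 3 = 4, matching the iterations 0, 3, 6, 9.  */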
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
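      /* Worked example: n = 10, nthreads = 4 gives q = 2, t = 2.
         Threads 0 and 1 each take three iterations ([0,3) and [3,6)),
         threads 2 and 3 take two ([6,8) and [8,10)): the first t
         threads absorb the remainder.  */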

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a c-sized chunk positioned static_trip * nthreads
         chunks into the iteration space, offset by our thread number.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
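      /* Worked example: c = 2, nthreads = 3 means thread 1 takes
         zero-based iterations [2,4) on its first trip and [8,10) on its
         second, striding nthreads * c = 6 iterations per trip.  */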

      /* Detect when we run off the end of the iteration space.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
  if (ws->incr < 0)
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
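  /* With a negative increment both CHUNK and LEFT are negative here
     (the stored chunk_size was scaled by the increment when the work
     share was initialized), so the flipped comparison above still
     clamps the chunk to the iterations that remain.  */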
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

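  /* A nonzero ws->mode means the loop bounds were checked at
     initialization so that an unconditional fetch-and-add on ws->next
     cannot overflow, permitting the wait-free path below instead of
     the compare-and-swap loop.  */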
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
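  /* A relaxed load is sufficient: if another thread races ahead, the
     compare-and-swap below fails and hands back the fresh value.  */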
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
        return false;

      if (incr < 0)
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;
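  /* Example: with 100 iterations left and 4 threads this grabs
     ceil (100 / 4) = 25 iterations, the next caller sees 75 left and
     grabs 19, and so on, with chunk_size as the lower bound.  */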

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
        return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */