xref: /llvm-project/pstl/include/pstl/internal/omp/parallel_transform_reduce.h (revision 843c12d6a0cdfd64c5a92e24eb58ba9ee17ca1ee)
1 // -*- C++ -*-
2 // -*-===----------------------------------------------------------------------===//
3 //
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
7 // See https://llvm.org/LICENSE.txt for license information.
8 //
9 //===----------------------------------------------------------------------===//
10 
11 #ifndef _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H
12 #define _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H
13 
14 #include "util.h"
15 
16 namespace __pstl
17 {
18 namespace __omp_backend
19 {
20 
21 //------------------------------------------------------------------------
22 // parallel_transform_reduce
23 //
24 // Notation:
25 //      r(i,j,init) returns reduction of init with reduction over [i,j)
26 //      u(i) returns f(i,i+1,identity) for a hypothetical left identity element
27 //      of r c(x,y) combines values x and y that were the result of r or u
28 //------------------------------------------------------------------------
29 
30 template <class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner, class _Reduction>
31 auto
__transform_reduce_body(_RandomAccessIterator __first,_RandomAccessIterator __last,_UnaryOp __unary_op,_Value __init,_Combiner __combiner,_Reduction __reduction)32 __transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op, _Value __init,
33                         _Combiner __combiner, _Reduction __reduction)
34 {
35     const std::size_t __num_threads = omp_get_num_threads();
36     const std::size_t __size = __last - __first;
37 
38     // Initial partition of the iteration space into chunks. If the range is too small,
39     // this will result in a nonsense policy, so we check on the size as well below.
40     auto __policy = __omp_backend::__chunk_partitioner(__first + __num_threads, __last);
41 
42     if (__size <= __num_threads || __policy.__n_chunks < 2)
43     {
44         return __reduction(__first, __last, __init);
45     }
46 
47     // Here, we cannot use OpenMP UDR because we must store the init value in
48     // the combiner and it will be used several times. Although there should be
49     // the only one; we manually generate the identity elements for each thread.
50     std::vector<_Value> __accums;
51     __accums.reserve(__num_threads);
52 
53     // initialize accumulators for all threads
54     for (std::size_t __i = 0; __i < __num_threads; ++__i)
55     {
56         __accums.emplace_back(__unary_op(__first + __i));
57     }
58 
59     // main loop
60     _PSTL_PRAGMA(omp taskloop shared(__accums))
61     for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
62     {
63         __pstl::__omp_backend::__process_chunk(__policy, __first + __num_threads, __chunk,
64                                        [&](auto __chunk_first, auto __chunk_last)
65                                        {
66                                            auto __thread_num = omp_get_thread_num();
67                                            __accums[__thread_num] =
68                                                __reduction(__chunk_first, __chunk_last, __accums[__thread_num]);
69                                        });
70     }
71 
72     // combine by accumulators
73     for (std::size_t __i = 0; __i < __num_threads; ++__i)
74     {
75         __init = __combiner(__init, __accums[__i]);
76     }
77 
78     return __init;
79 }
80 
81 template <class _ExecutionPolicy, class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner,
82           class _Reduction>
83 _Value
__parallel_transform_reduce(__pstl::__internal::__openmp_backend_tag,_ExecutionPolicy &&,_RandomAccessIterator __first,_RandomAccessIterator __last,_UnaryOp __unary_op,_Value __init,_Combiner __combiner,_Reduction __reduction)84 __parallel_transform_reduce(__pstl::__internal::__openmp_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator __first,
85                             _RandomAccessIterator __last, _UnaryOp __unary_op, _Value __init, _Combiner __combiner,
86                             _Reduction __reduction)
87 {
88     _Value __result = __init;
89     if (omp_in_parallel())
90     {
91         // We don't create a nested parallel region in an existing parallel
92         // region: just create tasks
93         __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
94                                                                   __reduction);
95     }
96     else
97     {
98         // Create a parallel region, and a single thread will create tasks
99         // for the region.
100         _PSTL_PRAGMA(omp parallel)
101         _PSTL_PRAGMA(omp single nowait)
102         {
103             __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
104                                                                       __reduction);
105         }
106     }
107 
108     return __result;
109 }
110 
111 } // namespace __omp_backend
112 } // namespace __pstl
113 #endif // _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H
114