xref: /netbsd-src/external/apache2/llvm/dist/libcxx/include/barrier (revision 4d6fc14bc9b0c5bf3e30be318c143ee82cadd108)
1*4d6fc14bSjoerg// -*- C++ -*-
2*4d6fc14bSjoerg//===--------------------------- barrier ----------------------------------===//
3*4d6fc14bSjoerg//
4*4d6fc14bSjoerg// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*4d6fc14bSjoerg// See https://llvm.org/LICENSE.txt for license information.
6*4d6fc14bSjoerg// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*4d6fc14bSjoerg//
8*4d6fc14bSjoerg//===----------------------------------------------------------------------===//
9*4d6fc14bSjoerg
10*4d6fc14bSjoerg#ifndef _LIBCPP_BARRIER
11*4d6fc14bSjoerg#define _LIBCPP_BARRIER
12*4d6fc14bSjoerg
13*4d6fc14bSjoerg/*
14*4d6fc14bSjoerg    barrier synopsis
15*4d6fc14bSjoerg
16*4d6fc14bSjoergnamespace std
17*4d6fc14bSjoerg{
18*4d6fc14bSjoerg
19*4d6fc14bSjoerg  template<class CompletionFunction = see below>
20*4d6fc14bSjoerg  class barrier
21*4d6fc14bSjoerg  {
22*4d6fc14bSjoerg  public:
23*4d6fc14bSjoerg    using arrival_token = see below;
24*4d6fc14bSjoerg
25*4d6fc14bSjoerg    static constexpr ptrdiff_t max() noexcept;
26*4d6fc14bSjoerg
27*4d6fc14bSjoerg    constexpr explicit barrier(ptrdiff_t phase_count,
28*4d6fc14bSjoerg                               CompletionFunction f = CompletionFunction());
29*4d6fc14bSjoerg    ~barrier();
30*4d6fc14bSjoerg
31*4d6fc14bSjoerg    barrier(const barrier&) = delete;
32*4d6fc14bSjoerg    barrier& operator=(const barrier&) = delete;
33*4d6fc14bSjoerg
34*4d6fc14bSjoerg    [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1);
35*4d6fc14bSjoerg    void wait(arrival_token&& arrival) const;
36*4d6fc14bSjoerg
37*4d6fc14bSjoerg    void arrive_and_wait();
38*4d6fc14bSjoerg    void arrive_and_drop();
39*4d6fc14bSjoerg
40*4d6fc14bSjoerg  private:
41*4d6fc14bSjoerg    CompletionFunction completion; // exposition only
42*4d6fc14bSjoerg  };
43*4d6fc14bSjoerg
44*4d6fc14bSjoerg}
45*4d6fc14bSjoerg
46*4d6fc14bSjoerg*/
47*4d6fc14bSjoerg
48*4d6fc14bSjoerg#include <__config>
49*4d6fc14bSjoerg#include <__availability>
50*4d6fc14bSjoerg#include <atomic>
51*4d6fc14bSjoerg#ifndef _LIBCPP_HAS_NO_TREE_BARRIER
52*4d6fc14bSjoerg# include <memory>
53*4d6fc14bSjoerg#endif
54*4d6fc14bSjoerg
55*4d6fc14bSjoerg#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
56*4d6fc14bSjoerg#pragma GCC system_header
57*4d6fc14bSjoerg#endif
58*4d6fc14bSjoerg
59*4d6fc14bSjoerg#ifdef _LIBCPP_HAS_NO_THREADS
60*4d6fc14bSjoerg# error <barrier> is not supported on this single threaded system
61*4d6fc14bSjoerg#endif
62*4d6fc14bSjoerg
63*4d6fc14bSjoerg_LIBCPP_PUSH_MACROS
64*4d6fc14bSjoerg#include <__undef_macros>
65*4d6fc14bSjoerg
66*4d6fc14bSjoerg#if _LIBCPP_STD_VER >= 14
67*4d6fc14bSjoerg
68*4d6fc14bSjoerg_LIBCPP_BEGIN_NAMESPACE_STD
69*4d6fc14bSjoerg
70*4d6fc14bSjoergstruct __empty_completion
71*4d6fc14bSjoerg{
72*4d6fc14bSjoerg    inline _LIBCPP_INLINE_VISIBILITY
73*4d6fc14bSjoerg    void operator()() noexcept
74*4d6fc14bSjoerg    {
75*4d6fc14bSjoerg    }
76*4d6fc14bSjoerg};
77*4d6fc14bSjoerg
78*4d6fc14bSjoerg#ifndef _LIBCPP_HAS_NO_TREE_BARRIER
79*4d6fc14bSjoerg
80*4d6fc14bSjoerg/*
81*4d6fc14bSjoerg
82*4d6fc14bSjoergThe default implementation of __barrier_base is a classic tree barrier.
83*4d6fc14bSjoerg
84*4d6fc14bSjoergIt looks different from literature pseudocode for two main reasons:
85*4d6fc14bSjoerg 1. Threads that call into std::barrier functions do not provide indices,
86*4d6fc14bSjoerg    so a numbering step is added before the actual barrier algorithm,
87*4d6fc14bSjoerg    appearing as an N+1 round to the N rounds of the tree barrier.
88*4d6fc14bSjoerg 2. A great deal of attention has been paid to avoid cache line thrashing
89*4d6fc14bSjoerg    by flattening the tree structure into cache-line sized arrays, that
90*4d6fc14bSjoerg    are indexed in an efficient way.
91*4d6fc14bSjoerg
92*4d6fc14bSjoerg*/
93*4d6fc14bSjoerg
94*4d6fc14bSjoergusing __barrier_phase_t = uint8_t;
95*4d6fc14bSjoerg
96*4d6fc14bSjoergclass __barrier_algorithm_base;
97*4d6fc14bSjoerg
98*4d6fc14bSjoerg_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
99*4d6fc14bSjoerg__barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected);
100*4d6fc14bSjoerg
101*4d6fc14bSjoerg_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
102*4d6fc14bSjoergbool __arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier,
103*4d6fc14bSjoerg                                     __barrier_phase_t __old_phase);
104*4d6fc14bSjoerg
105*4d6fc14bSjoerg_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI
106*4d6fc14bSjoergvoid __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier);
107*4d6fc14bSjoerg
108*4d6fc14bSjoergtemplate<class _CompletionF>
109*4d6fc14bSjoergclass __barrier_base {
110*4d6fc14bSjoerg
111*4d6fc14bSjoerg    ptrdiff_t                                               __expected;
112*4d6fc14bSjoerg    unique_ptr<__barrier_algorithm_base,
113*4d6fc14bSjoerg               void (*)(__barrier_algorithm_base*)>         __base;
114*4d6fc14bSjoerg    __atomic_base<ptrdiff_t>                                __expected_adjustment;
115*4d6fc14bSjoerg    _CompletionF                                            __completion;
116*4d6fc14bSjoerg    __atomic_base<__barrier_phase_t>                        __phase;
117*4d6fc14bSjoerg
118*4d6fc14bSjoergpublic:
119*4d6fc14bSjoerg    using arrival_token = __barrier_phase_t;
120*4d6fc14bSjoerg
121*4d6fc14bSjoerg    static constexpr ptrdiff_t max() noexcept {
122*4d6fc14bSjoerg        return numeric_limits<ptrdiff_t>::max();
123*4d6fc14bSjoerg    }
124*4d6fc14bSjoerg
125*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
126*4d6fc14bSjoerg    __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
127*4d6fc14bSjoerg            : __expected(__expected), __base(__construct_barrier_algorithm_base(this->__expected),
128*4d6fc14bSjoerg                                             &__destroy_barrier_algorithm_base),
129*4d6fc14bSjoerg              __expected_adjustment(0), __completion(move(__completion)), __phase(0)
130*4d6fc14bSjoerg    {
131*4d6fc14bSjoerg    }
132*4d6fc14bSjoerg    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
133*4d6fc14bSjoerg    arrival_token arrive(ptrdiff_t update)
134*4d6fc14bSjoerg    {
135*4d6fc14bSjoerg        auto const __old_phase = __phase.load(memory_order_relaxed);
136*4d6fc14bSjoerg        for(; update; --update)
137*4d6fc14bSjoerg            if(__arrive_barrier_algorithm_base(__base.get(), __old_phase)) {
138*4d6fc14bSjoerg                __completion();
139*4d6fc14bSjoerg                __expected += __expected_adjustment.load(memory_order_relaxed);
140*4d6fc14bSjoerg                __expected_adjustment.store(0, memory_order_relaxed);
141*4d6fc14bSjoerg                __phase.store(__old_phase + 2, memory_order_release);
142*4d6fc14bSjoerg                __phase.notify_all();
143*4d6fc14bSjoerg            }
144*4d6fc14bSjoerg        return __old_phase;
145*4d6fc14bSjoerg    }
146*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
147*4d6fc14bSjoerg    void wait(arrival_token&& __old_phase) const
148*4d6fc14bSjoerg    {
149*4d6fc14bSjoerg        auto const __test_fn = [=]() -> bool {
150*4d6fc14bSjoerg            return __phase.load(memory_order_acquire) != __old_phase;
151*4d6fc14bSjoerg        };
152*4d6fc14bSjoerg        __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy());
153*4d6fc14bSjoerg    }
154*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
155*4d6fc14bSjoerg    void arrive_and_drop()
156*4d6fc14bSjoerg    {
157*4d6fc14bSjoerg        __expected_adjustment.fetch_sub(1, memory_order_relaxed);
158*4d6fc14bSjoerg        (void)arrive(1);
159*4d6fc14bSjoerg    }
160*4d6fc14bSjoerg};
161*4d6fc14bSjoerg
162*4d6fc14bSjoerg#else
163*4d6fc14bSjoerg
164*4d6fc14bSjoerg/*
165*4d6fc14bSjoerg
166*4d6fc14bSjoergThe alternative implementation of __barrier_base is a central barrier.
167*4d6fc14bSjoerg
168*4d6fc14bSjoergTwo versions of this algorithm are provided:
169*4d6fc14bSjoerg 1. A fairly straightforward implementation of the litterature for the
170*4d6fc14bSjoerg    general case where the completion function is not empty.
171*4d6fc14bSjoerg 2. An optimized implementation that exploits 2's complement arithmetic
172*4d6fc14bSjoerg    and well-defined overflow in atomic arithmetic, to handle the phase
173*4d6fc14bSjoerg    roll-over for free.
174*4d6fc14bSjoerg
175*4d6fc14bSjoerg*/
176*4d6fc14bSjoerg
177*4d6fc14bSjoergtemplate<class _CompletionF>
178*4d6fc14bSjoergclass __barrier_base {
179*4d6fc14bSjoerg
180*4d6fc14bSjoerg    __atomic_base<ptrdiff_t> __expected;
181*4d6fc14bSjoerg    __atomic_base<ptrdiff_t> __arrived;
182*4d6fc14bSjoerg    _CompletionF             __completion;
183*4d6fc14bSjoerg    __atomic_base<bool>      __phase;
184*4d6fc14bSjoergpublic:
185*4d6fc14bSjoerg    using arrival_token = bool;
186*4d6fc14bSjoerg
187*4d6fc14bSjoerg    static constexpr ptrdiff_t max() noexcept {
188*4d6fc14bSjoerg        return numeric_limits<ptrdiff_t>::max();
189*4d6fc14bSjoerg    }
190*4d6fc14bSjoerg
191*4d6fc14bSjoerg    _LIBCPP_INLINE_VISIBILITY
192*4d6fc14bSjoerg    __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
193*4d6fc14bSjoerg        : __expected(__expected), __arrived(__expected), __completion(move(__completion)), __phase(false)
194*4d6fc14bSjoerg    {
195*4d6fc14bSjoerg    }
196*4d6fc14bSjoerg    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
197*4d6fc14bSjoerg    arrival_token arrive(ptrdiff_t update)
198*4d6fc14bSjoerg    {
199*4d6fc14bSjoerg        auto const __old_phase = __phase.load(memory_order_relaxed);
200*4d6fc14bSjoerg        auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update;
201*4d6fc14bSjoerg        auto const new_expected = __expected.load(memory_order_relaxed);
202*4d6fc14bSjoerg        if(0 == __result) {
203*4d6fc14bSjoerg            __completion();
204*4d6fc14bSjoerg            __arrived.store(new_expected, memory_order_relaxed);
205*4d6fc14bSjoerg            __phase.store(!__old_phase, memory_order_release);
206*4d6fc14bSjoerg            __phase.notify_all();
207*4d6fc14bSjoerg        }
208*4d6fc14bSjoerg        return __old_phase;
209*4d6fc14bSjoerg    }
210*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
211*4d6fc14bSjoerg    void wait(arrival_token&& __old_phase) const
212*4d6fc14bSjoerg    {
213*4d6fc14bSjoerg        __phase.wait(__old_phase, memory_order_acquire);
214*4d6fc14bSjoerg    }
215*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
216*4d6fc14bSjoerg    void arrive_and_drop()
217*4d6fc14bSjoerg    {
218*4d6fc14bSjoerg        __expected.fetch_sub(1, memory_order_relaxed);
219*4d6fc14bSjoerg        (void)arrive(1);
220*4d6fc14bSjoerg    }
221*4d6fc14bSjoerg};
222*4d6fc14bSjoerg
223*4d6fc14bSjoergtemplate<>
224*4d6fc14bSjoergclass __barrier_base<__empty_completion> {
225*4d6fc14bSjoerg
226*4d6fc14bSjoerg    static constexpr uint64_t __expected_unit = 1ull;
227*4d6fc14bSjoerg    static constexpr uint64_t __arrived_unit = 1ull << 32;
228*4d6fc14bSjoerg    static constexpr uint64_t __expected_mask = __arrived_unit - 1;
229*4d6fc14bSjoerg    static constexpr uint64_t __phase_bit = 1ull << 63;
230*4d6fc14bSjoerg    static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask;
231*4d6fc14bSjoerg
232*4d6fc14bSjoerg    __atomic_base<uint64_t>   __phase_arrived_expected;
233*4d6fc14bSjoerg
234*4d6fc14bSjoerg    static _LIBCPP_INLINE_VISIBILITY
235*4d6fc14bSjoerg    constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT
236*4d6fc14bSjoerg    {
237*4d6fc14bSjoerg        return ((uint64_t(1u << 31) - __count) << 32)
238*4d6fc14bSjoerg              | (uint64_t(1u << 31) - __count);
239*4d6fc14bSjoerg    }
240*4d6fc14bSjoerg
241*4d6fc14bSjoergpublic:
242*4d6fc14bSjoerg    using arrival_token = uint64_t;
243*4d6fc14bSjoerg
244*4d6fc14bSjoerg    static constexpr ptrdiff_t max() noexcept {
245*4d6fc14bSjoerg        return ptrdiff_t(1u << 31) - 1;
246*4d6fc14bSjoerg    }
247*4d6fc14bSjoerg
248*4d6fc14bSjoerg    _LIBCPP_INLINE_VISIBILITY
249*4d6fc14bSjoerg    explicit inline __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion())
250*4d6fc14bSjoerg        : __phase_arrived_expected(__init(__count))
251*4d6fc14bSjoerg    {
252*4d6fc14bSjoerg    }
253*4d6fc14bSjoerg    [[nodiscard]] inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
254*4d6fc14bSjoerg    arrival_token arrive(ptrdiff_t update)
255*4d6fc14bSjoerg    {
256*4d6fc14bSjoerg        auto const __inc = __arrived_unit * update;
257*4d6fc14bSjoerg        auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel);
258*4d6fc14bSjoerg        if((__old ^ (__old + __inc)) & __phase_bit) {
259*4d6fc14bSjoerg            __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed);
260*4d6fc14bSjoerg            __phase_arrived_expected.notify_all();
261*4d6fc14bSjoerg        }
262*4d6fc14bSjoerg        return __old & __phase_bit;
263*4d6fc14bSjoerg    }
264*4d6fc14bSjoerg    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
265*4d6fc14bSjoerg    void wait(arrival_token&& __phase) const
266*4d6fc14bSjoerg    {
267*4d6fc14bSjoerg        auto const __test_fn = [=]() -> bool {
268*4d6fc14bSjoerg            uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire);
269*4d6fc14bSjoerg            return ((__current & __phase_bit) != __phase);
270*4d6fc14bSjoerg        };
271*4d6fc14bSjoerg        __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy());
272*4d6fc14bSjoerg    }
273*4d6fc14bSjoerg    inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
274*4d6fc14bSjoerg    void arrive_and_drop()
275*4d6fc14bSjoerg    {
276*4d6fc14bSjoerg        __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed);
277*4d6fc14bSjoerg        (void)arrive(1);
278*4d6fc14bSjoerg    }
279*4d6fc14bSjoerg};
280*4d6fc14bSjoerg
281*4d6fc14bSjoerg#endif //_LIBCPP_HAS_NO_TREE_BARRIER
282*4d6fc14bSjoerg
283*4d6fc14bSjoergtemplate<class _CompletionF = __empty_completion>
284*4d6fc14bSjoergclass barrier {
285*4d6fc14bSjoerg
286*4d6fc14bSjoerg    __barrier_base<_CompletionF> __b;
287*4d6fc14bSjoergpublic:
288*4d6fc14bSjoerg    using arrival_token = typename __barrier_base<_CompletionF>::arrival_token;
289*4d6fc14bSjoerg
290*4d6fc14bSjoerg    static constexpr ptrdiff_t max() noexcept {
291*4d6fc14bSjoerg        return __barrier_base<_CompletionF>::max();
292*4d6fc14bSjoerg    }
293*4d6fc14bSjoerg
294*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
295*4d6fc14bSjoerg    barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF())
296*4d6fc14bSjoerg        : __b(__count, _VSTD::move(__completion)) {
297*4d6fc14bSjoerg    }
298*4d6fc14bSjoerg
299*4d6fc14bSjoerg    barrier(barrier const&) = delete;
300*4d6fc14bSjoerg    barrier& operator=(barrier const&) = delete;
301*4d6fc14bSjoerg
302*4d6fc14bSjoerg    [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
303*4d6fc14bSjoerg    arrival_token arrive(ptrdiff_t update = 1)
304*4d6fc14bSjoerg    {
305*4d6fc14bSjoerg        return __b.arrive(update);
306*4d6fc14bSjoerg    }
307*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
308*4d6fc14bSjoerg    void wait(arrival_token&& __phase) const
309*4d6fc14bSjoerg    {
310*4d6fc14bSjoerg        __b.wait(_VSTD::move(__phase));
311*4d6fc14bSjoerg    }
312*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
313*4d6fc14bSjoerg    void arrive_and_wait()
314*4d6fc14bSjoerg    {
315*4d6fc14bSjoerg        wait(arrive());
316*4d6fc14bSjoerg    }
317*4d6fc14bSjoerg    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
318*4d6fc14bSjoerg    void arrive_and_drop()
319*4d6fc14bSjoerg    {
320*4d6fc14bSjoerg        __b.arrive_and_drop();
321*4d6fc14bSjoerg    }
322*4d6fc14bSjoerg};
323*4d6fc14bSjoerg
324*4d6fc14bSjoerg_LIBCPP_END_NAMESPACE_STD
325*4d6fc14bSjoerg
326*4d6fc14bSjoerg#endif // _LIBCPP_STD_VER >= 14
327*4d6fc14bSjoerg
328*4d6fc14bSjoerg_LIBCPP_POP_MACROS
329*4d6fc14bSjoerg
330*4d6fc14bSjoerg#endif //_LIBCPP_BARRIER
331