1*4d6fc14bSjoerg// -*- C++ -*- 2*4d6fc14bSjoerg//===--------------------------- barrier ----------------------------------===// 3*4d6fc14bSjoerg// 4*4d6fc14bSjoerg// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5*4d6fc14bSjoerg// See https://llvm.org/LICENSE.txt for license information. 6*4d6fc14bSjoerg// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7*4d6fc14bSjoerg// 8*4d6fc14bSjoerg//===----------------------------------------------------------------------===// 9*4d6fc14bSjoerg 10*4d6fc14bSjoerg#ifndef _LIBCPP_BARRIER 11*4d6fc14bSjoerg#define _LIBCPP_BARRIER 12*4d6fc14bSjoerg 13*4d6fc14bSjoerg/* 14*4d6fc14bSjoerg barrier synopsis 15*4d6fc14bSjoerg 16*4d6fc14bSjoergnamespace std 17*4d6fc14bSjoerg{ 18*4d6fc14bSjoerg 19*4d6fc14bSjoerg template<class CompletionFunction = see below> 20*4d6fc14bSjoerg class barrier 21*4d6fc14bSjoerg { 22*4d6fc14bSjoerg public: 23*4d6fc14bSjoerg using arrival_token = see below; 24*4d6fc14bSjoerg 25*4d6fc14bSjoerg static constexpr ptrdiff_t max() noexcept; 26*4d6fc14bSjoerg 27*4d6fc14bSjoerg constexpr explicit barrier(ptrdiff_t phase_count, 28*4d6fc14bSjoerg CompletionFunction f = CompletionFunction()); 29*4d6fc14bSjoerg ~barrier(); 30*4d6fc14bSjoerg 31*4d6fc14bSjoerg barrier(const barrier&) = delete; 32*4d6fc14bSjoerg barrier& operator=(const barrier&) = delete; 33*4d6fc14bSjoerg 34*4d6fc14bSjoerg [[nodiscard]] arrival_token arrive(ptrdiff_t update = 1); 35*4d6fc14bSjoerg void wait(arrival_token&& arrival) const; 36*4d6fc14bSjoerg 37*4d6fc14bSjoerg void arrive_and_wait(); 38*4d6fc14bSjoerg void arrive_and_drop(); 39*4d6fc14bSjoerg 40*4d6fc14bSjoerg private: 41*4d6fc14bSjoerg CompletionFunction completion; // exposition only 42*4d6fc14bSjoerg }; 43*4d6fc14bSjoerg 44*4d6fc14bSjoerg} 45*4d6fc14bSjoerg 46*4d6fc14bSjoerg*/ 47*4d6fc14bSjoerg 48*4d6fc14bSjoerg#include <__config> 49*4d6fc14bSjoerg#include <__availability> 50*4d6fc14bSjoerg#include <atomic> 51*4d6fc14bSjoerg#ifndef _LIBCPP_HAS_NO_TREE_BARRIER 52*4d6fc14bSjoerg# include <memory> 53*4d6fc14bSjoerg#endif 54*4d6fc14bSjoerg 55*4d6fc14bSjoerg#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 56*4d6fc14bSjoerg#pragma GCC system_header 57*4d6fc14bSjoerg#endif 58*4d6fc14bSjoerg 59*4d6fc14bSjoerg#ifdef _LIBCPP_HAS_NO_THREADS 60*4d6fc14bSjoerg# error <barrier> is not supported on this single threaded system 61*4d6fc14bSjoerg#endif 62*4d6fc14bSjoerg 63*4d6fc14bSjoerg_LIBCPP_PUSH_MACROS 64*4d6fc14bSjoerg#include <__undef_macros> 65*4d6fc14bSjoerg 66*4d6fc14bSjoerg#if _LIBCPP_STD_VER >= 14 67*4d6fc14bSjoerg 68*4d6fc14bSjoerg_LIBCPP_BEGIN_NAMESPACE_STD 69*4d6fc14bSjoerg 70*4d6fc14bSjoergstruct __empty_completion 71*4d6fc14bSjoerg{ 72*4d6fc14bSjoerg inline _LIBCPP_INLINE_VISIBILITY 73*4d6fc14bSjoerg void operator()() noexcept 74*4d6fc14bSjoerg { 75*4d6fc14bSjoerg } 76*4d6fc14bSjoerg}; 77*4d6fc14bSjoerg 78*4d6fc14bSjoerg#ifndef _LIBCPP_HAS_NO_TREE_BARRIER 79*4d6fc14bSjoerg 80*4d6fc14bSjoerg/* 81*4d6fc14bSjoerg 82*4d6fc14bSjoergThe default implementation of __barrier_base is a classic tree barrier. 83*4d6fc14bSjoerg 84*4d6fc14bSjoergIt looks different from literature pseudocode for two main reasons: 85*4d6fc14bSjoerg 1. Threads that call into std::barrier functions do not provide indices, 86*4d6fc14bSjoerg so a numbering step is added before the actual barrier algorithm, 87*4d6fc14bSjoerg appearing as an N+1 round to the N rounds of the tree barrier. 88*4d6fc14bSjoerg 2. A great deal of attention has been paid to avoid cache line thrashing 89*4d6fc14bSjoerg by flattening the tree structure into cache-line sized arrays, that 90*4d6fc14bSjoerg are indexed in an efficient way. 91*4d6fc14bSjoerg 92*4d6fc14bSjoerg*/ 93*4d6fc14bSjoerg 94*4d6fc14bSjoergusing __barrier_phase_t = uint8_t; 95*4d6fc14bSjoerg 96*4d6fc14bSjoergclass __barrier_algorithm_base; 97*4d6fc14bSjoerg 98*4d6fc14bSjoerg_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 99*4d6fc14bSjoerg__barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected); 100*4d6fc14bSjoerg 101*4d6fc14bSjoerg_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 102*4d6fc14bSjoergbool __arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier, 103*4d6fc14bSjoerg __barrier_phase_t __old_phase); 104*4d6fc14bSjoerg 105*4d6fc14bSjoerg_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI 106*4d6fc14bSjoergvoid __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier); 107*4d6fc14bSjoerg 108*4d6fc14bSjoergtemplate<class _CompletionF> 109*4d6fc14bSjoergclass __barrier_base { 110*4d6fc14bSjoerg 111*4d6fc14bSjoerg ptrdiff_t __expected; 112*4d6fc14bSjoerg unique_ptr<__barrier_algorithm_base, 113*4d6fc14bSjoerg void (*)(__barrier_algorithm_base*)> __base; 114*4d6fc14bSjoerg __atomic_base<ptrdiff_t> __expected_adjustment; 115*4d6fc14bSjoerg _CompletionF __completion; 116*4d6fc14bSjoerg __atomic_base<__barrier_phase_t> __phase; 117*4d6fc14bSjoerg 118*4d6fc14bSjoergpublic: 119*4d6fc14bSjoerg using arrival_token = __barrier_phase_t; 120*4d6fc14bSjoerg 121*4d6fc14bSjoerg static constexpr ptrdiff_t max() noexcept { 122*4d6fc14bSjoerg return numeric_limits<ptrdiff_t>::max(); 123*4d6fc14bSjoerg } 124*4d6fc14bSjoerg 125*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 126*4d6fc14bSjoerg __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) 127*4d6fc14bSjoerg : __expected(__expected), __base(__construct_barrier_algorithm_base(this->__expected), 128*4d6fc14bSjoerg &__destroy_barrier_algorithm_base), 129*4d6fc14bSjoerg __expected_adjustment(0), __completion(move(__completion)), __phase(0) 130*4d6fc14bSjoerg { 131*4d6fc14bSjoerg } 132*4d6fc14bSjoerg [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 133*4d6fc14bSjoerg arrival_token arrive(ptrdiff_t update) 134*4d6fc14bSjoerg { 135*4d6fc14bSjoerg auto const __old_phase = __phase.load(memory_order_relaxed); 136*4d6fc14bSjoerg for(; update; --update) 137*4d6fc14bSjoerg if(__arrive_barrier_algorithm_base(__base.get(), __old_phase)) { 138*4d6fc14bSjoerg __completion(); 139*4d6fc14bSjoerg __expected += __expected_adjustment.load(memory_order_relaxed); 140*4d6fc14bSjoerg __expected_adjustment.store(0, memory_order_relaxed); 141*4d6fc14bSjoerg __phase.store(__old_phase + 2, memory_order_release); 142*4d6fc14bSjoerg __phase.notify_all(); 143*4d6fc14bSjoerg } 144*4d6fc14bSjoerg return __old_phase; 145*4d6fc14bSjoerg } 146*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 147*4d6fc14bSjoerg void wait(arrival_token&& __old_phase) const 148*4d6fc14bSjoerg { 149*4d6fc14bSjoerg auto const __test_fn = [=]() -> bool { 150*4d6fc14bSjoerg return __phase.load(memory_order_acquire) != __old_phase; 151*4d6fc14bSjoerg }; 152*4d6fc14bSjoerg __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy()); 153*4d6fc14bSjoerg } 154*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 155*4d6fc14bSjoerg void arrive_and_drop() 156*4d6fc14bSjoerg { 157*4d6fc14bSjoerg __expected_adjustment.fetch_sub(1, memory_order_relaxed); 158*4d6fc14bSjoerg (void)arrive(1); 159*4d6fc14bSjoerg } 160*4d6fc14bSjoerg}; 161*4d6fc14bSjoerg 162*4d6fc14bSjoerg#else 163*4d6fc14bSjoerg 164*4d6fc14bSjoerg/* 165*4d6fc14bSjoerg 166*4d6fc14bSjoergThe alternative implementation of __barrier_base is a central barrier. 167*4d6fc14bSjoerg 168*4d6fc14bSjoergTwo versions of this algorithm are provided: 169*4d6fc14bSjoerg 1. A fairly straightforward implementation of the litterature for the 170*4d6fc14bSjoerg general case where the completion function is not empty. 171*4d6fc14bSjoerg 2. An optimized implementation that exploits 2's complement arithmetic 172*4d6fc14bSjoerg and well-defined overflow in atomic arithmetic, to handle the phase 173*4d6fc14bSjoerg roll-over for free. 174*4d6fc14bSjoerg 175*4d6fc14bSjoerg*/ 176*4d6fc14bSjoerg 177*4d6fc14bSjoergtemplate<class _CompletionF> 178*4d6fc14bSjoergclass __barrier_base { 179*4d6fc14bSjoerg 180*4d6fc14bSjoerg __atomic_base<ptrdiff_t> __expected; 181*4d6fc14bSjoerg __atomic_base<ptrdiff_t> __arrived; 182*4d6fc14bSjoerg _CompletionF __completion; 183*4d6fc14bSjoerg __atomic_base<bool> __phase; 184*4d6fc14bSjoergpublic: 185*4d6fc14bSjoerg using arrival_token = bool; 186*4d6fc14bSjoerg 187*4d6fc14bSjoerg static constexpr ptrdiff_t max() noexcept { 188*4d6fc14bSjoerg return numeric_limits<ptrdiff_t>::max(); 189*4d6fc14bSjoerg } 190*4d6fc14bSjoerg 191*4d6fc14bSjoerg _LIBCPP_INLINE_VISIBILITY 192*4d6fc14bSjoerg __barrier_base(ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) 193*4d6fc14bSjoerg : __expected(__expected), __arrived(__expected), __completion(move(__completion)), __phase(false) 194*4d6fc14bSjoerg { 195*4d6fc14bSjoerg } 196*4d6fc14bSjoerg [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 197*4d6fc14bSjoerg arrival_token arrive(ptrdiff_t update) 198*4d6fc14bSjoerg { 199*4d6fc14bSjoerg auto const __old_phase = __phase.load(memory_order_relaxed); 200*4d6fc14bSjoerg auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update; 201*4d6fc14bSjoerg auto const new_expected = __expected.load(memory_order_relaxed); 202*4d6fc14bSjoerg if(0 == __result) { 203*4d6fc14bSjoerg __completion(); 204*4d6fc14bSjoerg __arrived.store(new_expected, memory_order_relaxed); 205*4d6fc14bSjoerg __phase.store(!__old_phase, memory_order_release); 206*4d6fc14bSjoerg __phase.notify_all(); 207*4d6fc14bSjoerg } 208*4d6fc14bSjoerg return __old_phase; 209*4d6fc14bSjoerg } 210*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 211*4d6fc14bSjoerg void wait(arrival_token&& __old_phase) const 212*4d6fc14bSjoerg { 213*4d6fc14bSjoerg __phase.wait(__old_phase, memory_order_acquire); 214*4d6fc14bSjoerg } 215*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 216*4d6fc14bSjoerg void arrive_and_drop() 217*4d6fc14bSjoerg { 218*4d6fc14bSjoerg __expected.fetch_sub(1, memory_order_relaxed); 219*4d6fc14bSjoerg (void)arrive(1); 220*4d6fc14bSjoerg } 221*4d6fc14bSjoerg}; 222*4d6fc14bSjoerg 223*4d6fc14bSjoergtemplate<> 224*4d6fc14bSjoergclass __barrier_base<__empty_completion> { 225*4d6fc14bSjoerg 226*4d6fc14bSjoerg static constexpr uint64_t __expected_unit = 1ull; 227*4d6fc14bSjoerg static constexpr uint64_t __arrived_unit = 1ull << 32; 228*4d6fc14bSjoerg static constexpr uint64_t __expected_mask = __arrived_unit - 1; 229*4d6fc14bSjoerg static constexpr uint64_t __phase_bit = 1ull << 63; 230*4d6fc14bSjoerg static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; 231*4d6fc14bSjoerg 232*4d6fc14bSjoerg __atomic_base<uint64_t> __phase_arrived_expected; 233*4d6fc14bSjoerg 234*4d6fc14bSjoerg static _LIBCPP_INLINE_VISIBILITY 235*4d6fc14bSjoerg constexpr uint64_t __init(ptrdiff_t __count) _NOEXCEPT 236*4d6fc14bSjoerg { 237*4d6fc14bSjoerg return ((uint64_t(1u << 31) - __count) << 32) 238*4d6fc14bSjoerg | (uint64_t(1u << 31) - __count); 239*4d6fc14bSjoerg } 240*4d6fc14bSjoerg 241*4d6fc14bSjoergpublic: 242*4d6fc14bSjoerg using arrival_token = uint64_t; 243*4d6fc14bSjoerg 244*4d6fc14bSjoerg static constexpr ptrdiff_t max() noexcept { 245*4d6fc14bSjoerg return ptrdiff_t(1u << 31) - 1; 246*4d6fc14bSjoerg } 247*4d6fc14bSjoerg 248*4d6fc14bSjoerg _LIBCPP_INLINE_VISIBILITY 249*4d6fc14bSjoerg explicit inline __barrier_base(ptrdiff_t __count, __empty_completion = __empty_completion()) 250*4d6fc14bSjoerg : __phase_arrived_expected(__init(__count)) 251*4d6fc14bSjoerg { 252*4d6fc14bSjoerg } 253*4d6fc14bSjoerg [[nodiscard]] inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 254*4d6fc14bSjoerg arrival_token arrive(ptrdiff_t update) 255*4d6fc14bSjoerg { 256*4d6fc14bSjoerg auto const __inc = __arrived_unit * update; 257*4d6fc14bSjoerg auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); 258*4d6fc14bSjoerg if((__old ^ (__old + __inc)) & __phase_bit) { 259*4d6fc14bSjoerg __phase_arrived_expected.fetch_add((__old & __expected_mask) << 32, memory_order_relaxed); 260*4d6fc14bSjoerg __phase_arrived_expected.notify_all(); 261*4d6fc14bSjoerg } 262*4d6fc14bSjoerg return __old & __phase_bit; 263*4d6fc14bSjoerg } 264*4d6fc14bSjoerg inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 265*4d6fc14bSjoerg void wait(arrival_token&& __phase) const 266*4d6fc14bSjoerg { 267*4d6fc14bSjoerg auto const __test_fn = [=]() -> bool { 268*4d6fc14bSjoerg uint64_t const __current = __phase_arrived_expected.load(memory_order_acquire); 269*4d6fc14bSjoerg return ((__current & __phase_bit) != __phase); 270*4d6fc14bSjoerg }; 271*4d6fc14bSjoerg __libcpp_thread_poll_with_backoff(__test_fn, __libcpp_timed_backoff_policy()); 272*4d6fc14bSjoerg } 273*4d6fc14bSjoerg inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 274*4d6fc14bSjoerg void arrive_and_drop() 275*4d6fc14bSjoerg { 276*4d6fc14bSjoerg __phase_arrived_expected.fetch_add(__expected_unit, memory_order_relaxed); 277*4d6fc14bSjoerg (void)arrive(1); 278*4d6fc14bSjoerg } 279*4d6fc14bSjoerg}; 280*4d6fc14bSjoerg 281*4d6fc14bSjoerg#endif //_LIBCPP_HAS_NO_TREE_BARRIER 282*4d6fc14bSjoerg 283*4d6fc14bSjoergtemplate<class _CompletionF = __empty_completion> 284*4d6fc14bSjoergclass barrier { 285*4d6fc14bSjoerg 286*4d6fc14bSjoerg __barrier_base<_CompletionF> __b; 287*4d6fc14bSjoergpublic: 288*4d6fc14bSjoerg using arrival_token = typename __barrier_base<_CompletionF>::arrival_token; 289*4d6fc14bSjoerg 290*4d6fc14bSjoerg static constexpr ptrdiff_t max() noexcept { 291*4d6fc14bSjoerg return __barrier_base<_CompletionF>::max(); 292*4d6fc14bSjoerg } 293*4d6fc14bSjoerg 294*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 295*4d6fc14bSjoerg barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) 296*4d6fc14bSjoerg : __b(__count, _VSTD::move(__completion)) { 297*4d6fc14bSjoerg } 298*4d6fc14bSjoerg 299*4d6fc14bSjoerg barrier(barrier const&) = delete; 300*4d6fc14bSjoerg barrier& operator=(barrier const&) = delete; 301*4d6fc14bSjoerg 302*4d6fc14bSjoerg [[nodiscard]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 303*4d6fc14bSjoerg arrival_token arrive(ptrdiff_t update = 1) 304*4d6fc14bSjoerg { 305*4d6fc14bSjoerg return __b.arrive(update); 306*4d6fc14bSjoerg } 307*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 308*4d6fc14bSjoerg void wait(arrival_token&& __phase) const 309*4d6fc14bSjoerg { 310*4d6fc14bSjoerg __b.wait(_VSTD::move(__phase)); 311*4d6fc14bSjoerg } 312*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 313*4d6fc14bSjoerg void arrive_and_wait() 314*4d6fc14bSjoerg { 315*4d6fc14bSjoerg wait(arrive()); 316*4d6fc14bSjoerg } 317*4d6fc14bSjoerg _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY 318*4d6fc14bSjoerg void arrive_and_drop() 319*4d6fc14bSjoerg { 320*4d6fc14bSjoerg __b.arrive_and_drop(); 321*4d6fc14bSjoerg } 322*4d6fc14bSjoerg}; 323*4d6fc14bSjoerg 324*4d6fc14bSjoerg_LIBCPP_END_NAMESPACE_STD 325*4d6fc14bSjoerg 326*4d6fc14bSjoerg#endif // _LIBCPP_STD_VER >= 14 327*4d6fc14bSjoerg 328*4d6fc14bSjoerg_LIBCPP_POP_MACROS 329*4d6fc14bSjoerg 330*4d6fc14bSjoerg#endif //_LIBCPP_BARRIER 331