// <barrier> -*- C++ -*-

// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

// This implementation is based on libcxx/include/barrier
//===-- barrier.h --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
30*b1e83836Smrg// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 31*b1e83836Smrg// 32*b1e83836Smrg//===---------------------------------------------------------------===// 33*b1e83836Smrg 34*b1e83836Smrg/** @file include/barrier 35*b1e83836Smrg * This is a Standard C++ Library header. 36*b1e83836Smrg */ 37*b1e83836Smrg 38*b1e83836Smrg#ifndef _GLIBCXX_BARRIER 39*b1e83836Smrg#define _GLIBCXX_BARRIER 1 40*b1e83836Smrg 41*b1e83836Smrg#pragma GCC system_header 42*b1e83836Smrg 43*b1e83836Smrg#if __cplusplus > 201703L 44*b1e83836Smrg#include <bits/atomic_base.h> 45*b1e83836Smrg#if __cpp_lib_atomic_wait && __cpp_aligned_new 46*b1e83836Smrg#include <bits/std_thread.h> 47*b1e83836Smrg#include <bits/unique_ptr.h> 48*b1e83836Smrg 49*b1e83836Smrg#include <array> 50*b1e83836Smrg 51*b1e83836Smrg#define __cpp_lib_barrier 201907L 52*b1e83836Smrg 53*b1e83836Smrgnamespace std _GLIBCXX_VISIBILITY(default) 54*b1e83836Smrg{ 55*b1e83836Smrg_GLIBCXX_BEGIN_NAMESPACE_VERSION 56*b1e83836Smrg 57*b1e83836Smrg struct __empty_completion 58*b1e83836Smrg { 59*b1e83836Smrg _GLIBCXX_ALWAYS_INLINE void 60*b1e83836Smrg operator()() noexcept 61*b1e83836Smrg { } 62*b1e83836Smrg }; 63*b1e83836Smrg 64*b1e83836Smrg/* 65*b1e83836Smrg 66*b1e83836SmrgThe default implementation of __tree_barrier is a classic tree barrier. 67*b1e83836Smrg 68*b1e83836SmrgIt looks different from literature pseudocode for two main reasons: 69*b1e83836Smrg 1. Threads that call into std::barrier functions do not provide indices, 70*b1e83836Smrg so a numbering step is added before the actual barrier algorithm, 71*b1e83836Smrg appearing as an N+1 round to the N rounds of the tree barrier. 72*b1e83836Smrg 2. A great deal of attention has been paid to avoid cache line thrashing 73*b1e83836Smrg by flattening the tree structure into cache-line sized arrays, that 74*b1e83836Smrg are indexed in an efficient way. 
75*b1e83836Smrg 76*b1e83836Smrg*/ 77*b1e83836Smrg 78*b1e83836Smrg enum class __barrier_phase_t : unsigned char { }; 79*b1e83836Smrg 80*b1e83836Smrg template<typename _CompletionF> 81*b1e83836Smrg class __tree_barrier 82*b1e83836Smrg { 83*b1e83836Smrg using __atomic_phase_ref_t = std::__atomic_ref<__barrier_phase_t>; 84*b1e83836Smrg using __atomic_phase_const_ref_t = std::__atomic_ref<const __barrier_phase_t>; 85*b1e83836Smrg static constexpr auto __phase_alignment = 86*b1e83836Smrg __atomic_phase_ref_t::required_alignment; 87*b1e83836Smrg 88*b1e83836Smrg using __tickets_t = std::array<__barrier_phase_t, 64>; 89*b1e83836Smrg struct alignas(64) /* naturally-align the heap state */ __state_t 90*b1e83836Smrg { 91*b1e83836Smrg alignas(__phase_alignment) __tickets_t __tickets; 92*b1e83836Smrg }; 93*b1e83836Smrg 94*b1e83836Smrg ptrdiff_t _M_expected; 95*b1e83836Smrg unique_ptr<__state_t[]> _M_state; 96*b1e83836Smrg __atomic_base<ptrdiff_t> _M_expected_adjustment; 97*b1e83836Smrg _CompletionF _M_completion; 98*b1e83836Smrg 99*b1e83836Smrg alignas(__phase_alignment) __barrier_phase_t _M_phase; 100*b1e83836Smrg 101*b1e83836Smrg bool 102*b1e83836Smrg _M_arrive(__barrier_phase_t __old_phase, size_t __current) 103*b1e83836Smrg { 104*b1e83836Smrg const auto __old_phase_val = static_cast<unsigned char>(__old_phase); 105*b1e83836Smrg const auto __half_step = 106*b1e83836Smrg static_cast<__barrier_phase_t>(__old_phase_val + 1); 107*b1e83836Smrg const auto __full_step = 108*b1e83836Smrg static_cast<__barrier_phase_t>(__old_phase_val + 2); 109*b1e83836Smrg 110*b1e83836Smrg size_t __current_expected = _M_expected; 111*b1e83836Smrg __current %= ((_M_expected + 1) >> 1); 112*b1e83836Smrg 113*b1e83836Smrg for (int __round = 0; ; ++__round) 114*b1e83836Smrg { 115*b1e83836Smrg if (__current_expected <= 1) 116*b1e83836Smrg return true; 117*b1e83836Smrg size_t const __end_node = ((__current_expected + 1) >> 1), 118*b1e83836Smrg __last_node = __end_node - 1; 119*b1e83836Smrg for ( ; ; 
++__current) 120*b1e83836Smrg { 121*b1e83836Smrg if (__current == __end_node) 122*b1e83836Smrg __current = 0; 123*b1e83836Smrg auto __expect = __old_phase; 124*b1e83836Smrg __atomic_phase_ref_t __phase(_M_state[__current] 125*b1e83836Smrg .__tickets[__round]); 126*b1e83836Smrg if (__current == __last_node && (__current_expected & 1)) 127*b1e83836Smrg { 128*b1e83836Smrg if (__phase.compare_exchange_strong(__expect, __full_step, 129*b1e83836Smrg memory_order_acq_rel)) 130*b1e83836Smrg break; // I'm 1 in 1, go to next __round 131*b1e83836Smrg } 132*b1e83836Smrg else if (__phase.compare_exchange_strong(__expect, __half_step, 133*b1e83836Smrg memory_order_acq_rel)) 134*b1e83836Smrg { 135*b1e83836Smrg return false; // I'm 1 in 2, done with arrival 136*b1e83836Smrg } 137*b1e83836Smrg else if (__expect == __half_step) 138*b1e83836Smrg { 139*b1e83836Smrg if (__phase.compare_exchange_strong(__expect, __full_step, 140*b1e83836Smrg memory_order_acq_rel)) 141*b1e83836Smrg break; // I'm 2 in 2, go to next __round 142*b1e83836Smrg } 143*b1e83836Smrg } 144*b1e83836Smrg __current_expected = __last_node + 1; 145*b1e83836Smrg __current >>= 1; 146*b1e83836Smrg } 147*b1e83836Smrg } 148*b1e83836Smrg 149*b1e83836Smrg public: 150*b1e83836Smrg using arrival_token = __barrier_phase_t; 151*b1e83836Smrg 152*b1e83836Smrg static constexpr ptrdiff_t 153*b1e83836Smrg max() noexcept 154*b1e83836Smrg { return __PTRDIFF_MAX__; } 155*b1e83836Smrg 156*b1e83836Smrg __tree_barrier(ptrdiff_t __expected, _CompletionF __completion) 157*b1e83836Smrg : _M_expected(__expected), _M_expected_adjustment(0), 158*b1e83836Smrg _M_completion(move(__completion)), 159*b1e83836Smrg _M_phase(static_cast<__barrier_phase_t>(0)) 160*b1e83836Smrg { 161*b1e83836Smrg size_t const __count = (_M_expected + 1) >> 1; 162*b1e83836Smrg 163*b1e83836Smrg _M_state = std::make_unique<__state_t[]>(__count); 164*b1e83836Smrg } 165*b1e83836Smrg 166*b1e83836Smrg [[nodiscard]] arrival_token 167*b1e83836Smrg arrive(ptrdiff_t __update) 
168*b1e83836Smrg { 169*b1e83836Smrg std::hash<std::thread::id> __hasher; 170*b1e83836Smrg size_t __current = __hasher(std::this_thread::get_id()); 171*b1e83836Smrg __atomic_phase_ref_t __phase(_M_phase); 172*b1e83836Smrg const auto __old_phase = __phase.load(memory_order_relaxed); 173*b1e83836Smrg const auto __cur = static_cast<unsigned char>(__old_phase); 174*b1e83836Smrg for(; __update; --__update) 175*b1e83836Smrg { 176*b1e83836Smrg if(_M_arrive(__old_phase, __current)) 177*b1e83836Smrg { 178*b1e83836Smrg _M_completion(); 179*b1e83836Smrg _M_expected += _M_expected_adjustment.load(memory_order_relaxed); 180*b1e83836Smrg _M_expected_adjustment.store(0, memory_order_relaxed); 181*b1e83836Smrg auto __new_phase = static_cast<__barrier_phase_t>(__cur + 2); 182*b1e83836Smrg __phase.store(__new_phase, memory_order_release); 183*b1e83836Smrg __phase.notify_all(); 184*b1e83836Smrg } 185*b1e83836Smrg } 186*b1e83836Smrg return __old_phase; 187*b1e83836Smrg } 188*b1e83836Smrg 189*b1e83836Smrg void 190*b1e83836Smrg wait(arrival_token&& __old_phase) const 191*b1e83836Smrg { 192*b1e83836Smrg __atomic_phase_const_ref_t __phase(_M_phase); 193*b1e83836Smrg auto const __test_fn = [=] 194*b1e83836Smrg { 195*b1e83836Smrg return __phase.load(memory_order_acquire) != __old_phase; 196*b1e83836Smrg }; 197*b1e83836Smrg std::__atomic_wait_address(&_M_phase, __test_fn); 198*b1e83836Smrg } 199*b1e83836Smrg 200*b1e83836Smrg void 201*b1e83836Smrg arrive_and_drop() 202*b1e83836Smrg { 203*b1e83836Smrg _M_expected_adjustment.fetch_sub(1, memory_order_relaxed); 204*b1e83836Smrg (void)arrive(1); 205*b1e83836Smrg } 206*b1e83836Smrg }; 207*b1e83836Smrg 208*b1e83836Smrg template<typename _CompletionF = __empty_completion> 209*b1e83836Smrg class barrier 210*b1e83836Smrg { 211*b1e83836Smrg // Note, we may introduce a "central" barrier algorithm at some point 212*b1e83836Smrg // for more space constrained targets 213*b1e83836Smrg using __algorithm_t = __tree_barrier<_CompletionF>; 214*b1e83836Smrg 
__algorithm_t _M_b; 215*b1e83836Smrg 216*b1e83836Smrg public: 217*b1e83836Smrg class arrival_token final 218*b1e83836Smrg { 219*b1e83836Smrg public: 220*b1e83836Smrg arrival_token(arrival_token&&) = default; 221*b1e83836Smrg arrival_token& operator=(arrival_token&&) = default; 222*b1e83836Smrg ~arrival_token() = default; 223*b1e83836Smrg 224*b1e83836Smrg private: 225*b1e83836Smrg friend class barrier; 226*b1e83836Smrg using __token = typename __algorithm_t::arrival_token; 227*b1e83836Smrg explicit arrival_token(__token __tok) noexcept : _M_tok(__tok) { } 228*b1e83836Smrg __token _M_tok; 229*b1e83836Smrg }; 230*b1e83836Smrg 231*b1e83836Smrg static constexpr ptrdiff_t 232*b1e83836Smrg max() noexcept 233*b1e83836Smrg { return __algorithm_t::max(); } 234*b1e83836Smrg 235*b1e83836Smrg explicit 236*b1e83836Smrg barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()) 237*b1e83836Smrg : _M_b(__count, std::move(__completion)) 238*b1e83836Smrg { } 239*b1e83836Smrg 240*b1e83836Smrg barrier(barrier const&) = delete; 241*b1e83836Smrg barrier& operator=(barrier const&) = delete; 242*b1e83836Smrg 243*b1e83836Smrg [[nodiscard]] arrival_token 244*b1e83836Smrg arrive(ptrdiff_t __update = 1) 245*b1e83836Smrg { return arrival_token{_M_b.arrive(__update)}; } 246*b1e83836Smrg 247*b1e83836Smrg void 248*b1e83836Smrg wait(arrival_token&& __phase) const 249*b1e83836Smrg { _M_b.wait(std::move(__phase._M_tok)); } 250*b1e83836Smrg 251*b1e83836Smrg void 252*b1e83836Smrg arrive_and_wait() 253*b1e83836Smrg { wait(arrive()); } 254*b1e83836Smrg 255*b1e83836Smrg void 256*b1e83836Smrg arrive_and_drop() 257*b1e83836Smrg { _M_b.arrive_and_drop(); } 258*b1e83836Smrg }; 259*b1e83836Smrg 260*b1e83836Smrg_GLIBCXX_END_NAMESPACE_VERSION 261*b1e83836Smrg} // namespace 262*b1e83836Smrg#endif // __cpp_lib_atomic_wait && __cpp_aligned_new 263*b1e83836Smrg#endif // __cplusplus > 201703L 264*b1e83836Smrg#endif // _GLIBCXX_BARRIER 265