1330d8983SJohannes Doerfert //===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// 2330d8983SJohannes Doerfert // 3330d8983SJohannes Doerfert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4330d8983SJohannes Doerfert // See https://llvm.org/LICENSE.txt for license information. 5330d8983SJohannes Doerfert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6330d8983SJohannes Doerfert // 7330d8983SJohannes Doerfert //===----------------------------------------------------------------------===// 8330d8983SJohannes Doerfert // 9330d8983SJohannes Doerfert // 10330d8983SJohannes Doerfert //===----------------------------------------------------------------------===// 11330d8983SJohannes Doerfert 12330d8983SJohannes Doerfert #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H 13330d8983SJohannes Doerfert #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H 14330d8983SJohannes Doerfert 1508533a3eSJohannes Doerfert #include "DeviceTypes.h" 16b57c0bacSJoseph Huber #include "DeviceUtils.h" 17b57c0bacSJoseph Huber 18b57c0bacSJoseph Huber #pragma omp begin declare target device_type(nohost) 19330d8983SJohannes Doerfert 20330d8983SJohannes Doerfert namespace ompx { 21330d8983SJohannes Doerfert namespace atomic { 22330d8983SJohannes Doerfert 23330d8983SJohannes Doerfert enum OrderingTy { 24330d8983SJohannes Doerfert relaxed = __ATOMIC_RELAXED, 25330d8983SJohannes Doerfert aquire = __ATOMIC_ACQUIRE, 26330d8983SJohannes Doerfert release = __ATOMIC_RELEASE, 27330d8983SJohannes Doerfert acq_rel = __ATOMIC_ACQ_REL, 28330d8983SJohannes Doerfert seq_cst = __ATOMIC_SEQ_CST, 29330d8983SJohannes Doerfert }; 30330d8983SJohannes Doerfert 31*3274bf6bSJoseph Huber enum MemScopeTy { 32f4ee5a67SJoseph Huber system = __MEMORY_SCOPE_SYSTEM, 33*3274bf6bSJoseph Huber device = __MEMORY_SCOPE_DEVICE, 34f4ee5a67SJoseph Huber workgroup = __MEMORY_SCOPE_WRKGRP, 35f4ee5a67SJoseph Huber wavefront = __MEMORY_SCOPE_WVFRNT, 36f4ee5a67SJoseph Huber single = __MEMORY_SCOPE_SINGLE, 37f4ee5a67SJoseph Huber }; 38f4ee5a67SJoseph Huber 39330d8983SJohannes Doerfert /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. 40330d8983SJohannes Doerfert uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering, 41*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device); 42330d8983SJohannes Doerfert 43330d8983SJohannes Doerfert /// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The 44330d8983SJohannes Doerfert /// result is stored in \p *Addr; 45330d8983SJohannes Doerfert /// { 46330d8983SJohannes Doerfert 47b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 48b57c0bacSJoseph Huber bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc, 49*3274bf6bSJoseph Huber atomic::OrderingTy OrderingFail, 50*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 51b57c0bacSJoseph Huber return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, 52*3274bf6bSJoseph Huber OrderingSucc, OrderingFail, MemScope); 53b57c0bacSJoseph Huber } 54330d8983SJohannes Doerfert 55b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 56*3274bf6bSJoseph Huber V add(Ty *Address, V Val, atomic::OrderingTy Ordering, 57*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 58*3274bf6bSJoseph Huber return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope); 59b57c0bacSJoseph Huber } 60330d8983SJohannes Doerfert 61b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 62*3274bf6bSJoseph Huber V load(Ty *Address, atomic::OrderingTy Ordering, 63*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 64*3274bf6bSJoseph Huber return __scoped_atomic_load_n(Address, Ordering, MemScope); 65b57c0bacSJoseph Huber } 66330d8983SJohannes Doerfert 67b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 68*3274bf6bSJoseph Huber void store(Ty *Address, V Val, atomic::OrderingTy Ordering, 69*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 70*3274bf6bSJoseph Huber __scoped_atomic_store_n(Address, Val, Ordering, MemScope); 71b57c0bacSJoseph Huber } 72330d8983SJohannes Doerfert 73b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 74*3274bf6bSJoseph Huber V mul(Ty *Address, V Val, atomic::OrderingTy Ordering, 75*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 76b57c0bacSJoseph Huber Ty TypedCurrentVal, TypedResultVal, TypedNewVal; 77b57c0bacSJoseph Huber bool Success; 78b57c0bacSJoseph Huber do { 79b57c0bacSJoseph Huber TypedCurrentVal = atomic::load(Address, Ordering); 80b57c0bacSJoseph Huber TypedNewVal = TypedCurrentVal * Val; 81b57c0bacSJoseph Huber Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, 82*3274bf6bSJoseph Huber atomic::relaxed, MemScope); 83b57c0bacSJoseph Huber } while (!Success); 84b57c0bacSJoseph Huber return TypedResultVal; 85b57c0bacSJoseph Huber } 86330d8983SJohannes Doerfert 87b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 88b57c0bacSJoseph Huber utils::enable_if_t<!utils::is_floating_point_v<V>, V> 89*3274bf6bSJoseph Huber max(Ty *Address, V Val, atomic::OrderingTy Ordering, 90*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 91*3274bf6bSJoseph Huber return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope); 92b57c0bacSJoseph Huber } 93330d8983SJohannes Doerfert 94b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 95b57c0bacSJoseph Huber utils::enable_if_t<utils::is_same_v<V, float>, V> 96*3274bf6bSJoseph Huber max(Ty *Address, V Val, atomic::OrderingTy Ordering, 97*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 98b57c0bacSJoseph Huber if (Val >= 0) 99*3274bf6bSJoseph Huber return utils::bitCast<float>(max( 100*3274bf6bSJoseph Huber (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); 101*3274bf6bSJoseph Huber return utils::bitCast<float>(min( 102*3274bf6bSJoseph Huber (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); 103b57c0bacSJoseph Huber } 104b57c0bacSJoseph Huber 105b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 106b57c0bacSJoseph Huber utils::enable_if_t<utils::is_same_v<V, double>, V> 107*3274bf6bSJoseph Huber max(Ty *Address, V Val, atomic::OrderingTy Ordering, 108*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 109b57c0bacSJoseph Huber if (Val >= 0) 110*3274bf6bSJoseph Huber return utils::bitCast<double>(max( 111*3274bf6bSJoseph Huber (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); 112*3274bf6bSJoseph Huber return utils::bitCast<double>(min( 113*3274bf6bSJoseph Huber (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); 114b57c0bacSJoseph Huber } 115b57c0bacSJoseph Huber 116b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 117b57c0bacSJoseph Huber utils::enable_if_t<!utils::is_floating_point_v<V>, V> 118*3274bf6bSJoseph Huber min(Ty *Address, V Val, atomic::OrderingTy Ordering, 119*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 120*3274bf6bSJoseph Huber return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope); 121b57c0bacSJoseph Huber } 122b57c0bacSJoseph Huber 123b57c0bacSJoseph Huber // TODO: Implement this with __atomic_fetch_max and remove the duplication. 124b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 125b57c0bacSJoseph Huber utils::enable_if_t<utils::is_same_v<V, float>, V> 126*3274bf6bSJoseph Huber min(Ty *Address, V Val, atomic::OrderingTy Ordering, 127*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 128b57c0bacSJoseph Huber if (Val >= 0) 129*3274bf6bSJoseph Huber return utils::bitCast<float>(min( 130*3274bf6bSJoseph Huber (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); 131*3274bf6bSJoseph Huber return utils::bitCast<float>(max( 132*3274bf6bSJoseph Huber (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); 133b57c0bacSJoseph Huber } 134b57c0bacSJoseph Huber 135b57c0bacSJoseph Huber // TODO: Implement this with __atomic_fetch_max and remove the duplication. 136b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 137b57c0bacSJoseph Huber utils::enable_if_t<utils::is_same_v<V, double>, V> 138*3274bf6bSJoseph Huber min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering, 139*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 140b57c0bacSJoseph Huber if (Val >= 0) 141*3274bf6bSJoseph Huber return utils::bitCast<double>(min( 142*3274bf6bSJoseph Huber (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); 143*3274bf6bSJoseph Huber return utils::bitCast<double>(max( 144*3274bf6bSJoseph Huber (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); 145b57c0bacSJoseph Huber } 146b57c0bacSJoseph Huber 147b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 148*3274bf6bSJoseph Huber V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering, 149*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 150*3274bf6bSJoseph Huber return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope); 151b57c0bacSJoseph Huber } 152b57c0bacSJoseph Huber 153b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 154*3274bf6bSJoseph Huber V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering, 155*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 156*3274bf6bSJoseph Huber return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope); 157b57c0bacSJoseph Huber } 158b57c0bacSJoseph Huber 159b57c0bacSJoseph Huber template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> 160*3274bf6bSJoseph Huber V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering, 161*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 162*3274bf6bSJoseph Huber return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope); 163b57c0bacSJoseph Huber } 164b57c0bacSJoseph Huber 165*3274bf6bSJoseph Huber static inline uint32_t 166*3274bf6bSJoseph Huber atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, 167*3274bf6bSJoseph Huber MemScopeTy MemScope = MemScopeTy::device) { 168b57c0bacSJoseph Huber uint32_t R; 169*3274bf6bSJoseph Huber __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope); 170b57c0bacSJoseph Huber return R; 171b57c0bacSJoseph Huber } 172330d8983SJohannes Doerfert 173330d8983SJohannes Doerfert ///} 174330d8983SJohannes Doerfert 175330d8983SJohannes Doerfert } // namespace atomic 176330d8983SJohannes Doerfert 177330d8983SJohannes Doerfert namespace synchronize { 178330d8983SJohannes Doerfert 179330d8983SJohannes Doerfert /// Initialize the synchronization machinery. Must be called by all threads. 180330d8983SJohannes Doerfert void init(bool IsSPMD); 181330d8983SJohannes Doerfert 182330d8983SJohannes Doerfert /// Synchronize all threads in a warp identified by \p Mask. 183330d8983SJohannes Doerfert void warp(LaneMaskTy Mask); 184330d8983SJohannes Doerfert 185330d8983SJohannes Doerfert /// Synchronize all threads in a block and perform a fence before and after the 186330d8983SJohannes Doerfert /// barrier according to \p Ordering. Note that the fence might be part of the 187330d8983SJohannes Doerfert /// barrier. 188330d8983SJohannes Doerfert void threads(atomic::OrderingTy Ordering); 189330d8983SJohannes Doerfert 190330d8983SJohannes Doerfert /// Synchronizing threads is allowed even if they all hit different instances of 191330d8983SJohannes Doerfert /// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more 192330d8983SJohannes Doerfert /// restrictive in that it requires all threads to hit the same instance. The 193330d8983SJohannes Doerfert /// noinline is removed by the openmp-opt pass and helps to preserve the 194330d8983SJohannes Doerfert /// information till then. 195330d8983SJohannes Doerfert ///{ 196330d8983SJohannes Doerfert 197330d8983SJohannes Doerfert /// Synchronize all threads in a block, they are reaching the same instruction 198330d8983SJohannes Doerfert /// (hence all threads in the block are "aligned"). Also perform a fence before 199330d8983SJohannes Doerfert /// and after the barrier according to \p Ordering. Note that the 200330d8983SJohannes Doerfert /// fence might be part of the barrier if the target offers this. 201723a3e74SJoseph Huber [[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void 20258af82b4SJoseph Huber threadsAligned(atomic::OrderingTy Ordering); 203330d8983SJohannes Doerfert 204330d8983SJohannes Doerfert ///} 205330d8983SJohannes Doerfert 206330d8983SJohannes Doerfert } // namespace synchronize 207330d8983SJohannes Doerfert 208330d8983SJohannes Doerfert namespace fence { 209330d8983SJohannes Doerfert 210330d8983SJohannes Doerfert /// Memory fence with \p Ordering semantics for the team. 211330d8983SJohannes Doerfert void team(atomic::OrderingTy Ordering); 212330d8983SJohannes Doerfert 213330d8983SJohannes Doerfert /// Memory fence with \p Ordering semantics for the contention group. 214330d8983SJohannes Doerfert void kernel(atomic::OrderingTy Ordering); 215330d8983SJohannes Doerfert 216330d8983SJohannes Doerfert /// Memory fence with \p Ordering semantics for the system. 217330d8983SJohannes Doerfert void system(atomic::OrderingTy Ordering); 218330d8983SJohannes Doerfert 219330d8983SJohannes Doerfert } // namespace fence 220330d8983SJohannes Doerfert 221330d8983SJohannes Doerfert } // namespace ompx 222330d8983SJohannes Doerfert 223b57c0bacSJoseph Huber #pragma omp end declare target 224b57c0bacSJoseph Huber 225330d8983SJohannes Doerfert #endif 226