/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_wait_release.h"
#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
// for distributed barrier
#include "kmp_affinity.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
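// Distributed barrier overview (a reader's sketch inferred from the usage in
// this file; the authoritative data layout lives in kmp_barrier.h): each
// thread in a team owns one cache-line-sized slot in four parallel arrays --
//   flags[iter][tid].stillNeed : cleared to 0 when the thread arrives
//   go[go_index].go            : polled by a "go group" of threads_per_go
//                                threads waiting to be released
//   iter[tid].iter             : the thread's barrier iteration, cycled
//                                modulo MAX_ITERS
//   sleep[tid].sleep           : set while the thread is blocked
// Threads are partitioned into num_groups groups of threads_per_group threads
// spanning gos_per_group go flags each. Gather walks group members, then
// group leaders; release fans out from the primary thread to group leaders,
// then to group members.
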
// Distributed barrier

// Compute how many threads to have polling each cache-line.
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
void distributedBarrier::computeVarsForN(size_t n) {
  int nsockets = 1;
  if (__kmp_topology) {
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);

    if (nsockets <= 0)
      nsockets = 1;
    if (ncores_per_socket <= 0)
      ncores_per_socket = 1;

    threads_per_go = ncores_per_socket >> 1;
    if (!fix_threads_per_go) {
      // Minimize num_gos
      if (threads_per_go > 4) {
        if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
          threads_per_go = threads_per_go >> 1;
        }
        if (threads_per_go > 4 && nsockets == 1)
          threads_per_go = threads_per_go >> 1;
      }
    }
    if (threads_per_go == 0)
      threads_per_go = 1;
    fix_threads_per_go = true;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (nsockets == 1 || num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / nsockets;
      if (num_gos % nsockets)
        num_groups++;
    }
    if (num_groups <= 0)
      num_groups = 1;
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  } else {
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / 2;
      if (num_gos % 2)
        num_groups++;
    }
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  }
}
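
// Worked example for computeVarsForN on a hypothetical machine: n = 24
// threads on a 2-socket box with 8 cores per socket. Then threads_per_go =
// 8 >> 1 = 4 (not > 4, so the minimization branch leaves it alone), num_gos =
// 24 / 4 = 6, num_groups = ceil(6 / 2 sockets) = 3, gos_per_group =
// ceil(6 / 3) = 2, and threads_per_group = 4 * 2 = 8.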

void distributedBarrier::computeGo(size_t n) {
  // Minimize num_gos
  for (num_gos = 1;; num_gos++)
    if (IDEAL_CONTENTION * num_gos >= n)
      break;
  threads_per_go = n / num_gos;
  if (n % num_gos)
    threads_per_go++;
  while (num_gos > MAX_GOS) {
    threads_per_go++;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
  }
  computeVarsForN(n);
}
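
// Worked example for computeGo, hypothetically taking IDEAL_CONTENTION = 16
// and MAX_GOS = 8 (the real constants live in kmp_barrier.h): for n = 100,
// the smallest num_gos with 16 * num_gos >= 100 is 7, so threads_per_go =
// ceil(100 / 7) = 15; since 7 <= MAX_GOS the while loop never runs, and
// computeVarsForN(100) finalizes the grouping.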

// Resize the barrier arrays when the new number of threads exceeds
// max_threads, which is the current capacity of all the arrays
void distributedBarrier::resize(size_t nthr) {
  KMP_DEBUG_ASSERT(nthr > max_threads);

  // expand to requested size * 2
  max_threads = nthr * 2;

  // allocate arrays to new max threads
  for (int i = 0; i < MAX_ITERS; ++i) {
    if (flags[i])
      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
                                                 max_threads * sizeof(flags_s));
    else
      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  }

  if (go)
    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  else
    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));

  if (iter)
    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  else
    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));

  if (sleep)
    sleep =
        (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  else
    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
}

// Set all the go flags that threads might be waiting on; when blocktime is
// not infinite, this should be followed by a wake-up call to each thread
kmp_uint64 distributedBarrier::go_release() {
  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  for (size_t j = 0; j < num_gos; j++) {
    go[j].go.store(next_go);
  }
  return next_go;
}

void distributedBarrier::go_reset() {
  for (size_t j = 0; j < max_threads; ++j) {
    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
      flags[i][j].stillNeed = 1;
    }
    go[j].go.store(0);
    iter[j].iter = 0;
  }
}

// Initialize/re-initialize the distributed barrier for a particular number of
// threads. If a resize of the arrays is needed, calls the resize function.
void distributedBarrier::init(size_t nthr) {
  size_t old_max = max_threads;
  if (nthr > max_threads) { // need more space in arrays
    resize(nthr);
  }

  for (size_t i = 0; i < max_threads; i++) {
    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
      flags[j][i].stillNeed = 1;
    }
    go[i].go.store(0);
    iter[i].iter = 0;
    if (i >= old_max)
      sleep[i].sleep = false;
  }

  // Recalculate num_gos, etc. based on new nthr
  computeVarsForN(nthr);

  num_threads = nthr;

  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
}

// This function is used only when KMP_BLOCKTIME is not infinite.
// static
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
                               size_t start, size_t stop, size_t inc,
                               size_t tid) {
  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    return;

  kmp_info_t **other_threads = team->t.t_threads;
  for (size_t thr = start; thr < stop; thr += inc) {
    KMP_DEBUG_ASSERT(other_threads[thr]);
    int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
    // Wake up the worker regardless of whether it appears to be sleeping
    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
  }
}
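
// Usage sketch for __kmp_dist_barrier_wakeup, taken from the release code
// below: with threads_per_group = 8 and nproc = 24, the primary thread wakes
// the group leaders with (start = 8, stop = 24, inc = 8), i.e. tids 8 and 16,
// and each leader then wakes its own members with (start = tid + 1,
// stop = min(tid + 8, nproc), inc = 1).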

static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_info_t **other_threads;
  kmp_uint64 my_current_iter, my_next_iter;
  kmp_uint32 nproc;
  bool group_leader;

  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;
  b = team->t.b;
  my_current_iter = b->iter[tid].iter;
  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
  group_leader = ((tid % b->threads_per_group) == 0);

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
            gtid, team->t.t_id, tid, bt));

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif

  if (group_leader) {
    // Start from the thread after the group leader
    size_t group_start = tid + 1;
    size_t group_end = tid + b->threads_per_group;
    size_t threads_pending = 0;

    if (group_end > nproc)
      group_end = nproc;
    do { // wait for threads in my group
      threads_pending = 0;
      // Check all the flags every time to avoid branch misprediction
      for (size_t thr = group_start; thr < group_end; thr++) {
        // Each thread uses a different cache line
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      OMPT_REDUCTION_DECL(this_thr, gtid);
      OMPT_REDUCTION_BEGIN;
      // Group leader reduces all threads in group
      for (size_t thr = group_start; thr < group_end; thr++) {
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[thr]->th.th_local.reduce_data);
      }
      OMPT_REDUCTION_END;
    }

    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;

    do { // wait for all group leaders
      threads_pending = 0;
      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        for (size_t thr = b->threads_per_group; thr < nproc;
             thr += b->threads_per_group) {
          (*reduce)(this_thr->th.th_local.reduce_data,
                    other_threads[thr]->th.th_local.reduce_data);
        }
        OMPT_REDUCTION_END;
      }
    }
  } else {
    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;
  }

  KMP_MFENCE();

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
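
// In summary, the gather above runs in two waves: each non-leader arms its
// stillNeed flag for the next iteration, then clears the current one and is
// done; each group leader first polls the stillNeed flags of its own members,
// then clears its own flag, then polls the flags of all group leaders
// (executing queued tasks while it spins). Reductions piggyback on the same
// structure: a leader combines its members' reduce_data after the first wait,
// and the primary thread combines the group leaders' after the second.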

static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_bstate_t *thr_bar;
  kmp_uint64 my_current_iter, next_go;
  size_t my_go_index;
  bool group_leader;

  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));

  thr_bar = &this_thr->th.th_bar[bt].bb;

  if (!KMP_MASTER_TID(tid)) {
    // workers and non-master group leaders need to check their presence in
    // the team
    do {
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) {
        // Thread is not in use in a team. Wait on location in tid's thread
        // struct. The 0 value tells anyone looking that this thread is spinning
        // or sleeping until this location becomes 3 again; 3 is the transition
        // state to get to 1 which is waiting on go and being in the team
        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
                                        0) ||
            this_thr->th.th_used_in_team.load() == 0) {
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
          // In fork barrier where we could not get the object reliably
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);

          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
          return;
      }
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
        continue;
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      // At this point, the thread thinks it is in use in a team, or in
      // transition to be used in a team, but it might have reached this barrier
      // before it was marked unused by the team. Unused threads are awoken and
      // shifted to wait on local thread struct elsewhere. It also might reach
      // this point by being picked up for use by a different team. Either way,
      // we need to update the tid.
      tid = __kmp_tid_from_gtid(gtid);
      team = this_thr->th.th_team;
      KMP_DEBUG_ASSERT(tid >= 0);
      KMP_DEBUG_ASSERT(team);
      b = team->t.b;
      my_current_iter = b->iter[tid].iter;
      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
      my_go_index = tid / b->threads_per_go;
      if (this_thr->th.th_used_in_team.load() == 3) {
        KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
      }
      // Check if go flag is set
      if (b->go[my_go_index].go.load() != next_go) {
        // Wait on go flag on team
        kmp_atomic_flag_64<false, true> my_flag(
            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                         b->iter[tid].iter == 0);
        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
      }

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;
      // At this point, the thread's go location was set. This means the primary
      // thread is safely in the barrier, and so this thread's data is
      // up-to-date, but we should check again that this thread is really in
      // use in the team, as it could have been woken up for the purpose of
      // changing team size, or reaping threads at shutdown.
      if (this_thr->th.th_used_in_team.load() == 1)
        break;
    } while (1);

    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    group_leader = ((tid % b->threads_per_group) == 0);
    if (group_leader) {
      // Tell all the threads in my group they can go!
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
        b->go[go_idx].go.store(next_go);
      }
      // Fence added so that workers can see changes to go. sfence inadequate.
      KMP_MFENCE();
    }

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // copy ICVs to final dest
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
      // This thread is now awake and participating in the barrier;
      // wake up the other threads in the group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  } else { // Primary thread
    team = this_thr->th.th_team;
    b = team->t.b;
    my_current_iter = b->iter[tid].iter;
    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
      // primary thread has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    // Tell all the group leaders they can go!
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
      b->go[go_idx].go.store(next_go);
    }

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake up the group leaders
      size_t nproc = this_thr->th.th_team_nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
    }

    // Tell all the threads in my group they can go!
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
      b->go[go_idx].go.store(next_go);
    }

    // Fence added so that workers can see changes to go. sfence inadequate.
    KMP_MFENCE();

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake up the other threads in my group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  }
  // Update to next iteration
  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;

  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
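
// Bookkeeping in the release above, as a worked example (assuming MAX_ITERS
// is 3; the constant lives in kmp_barrier.h): a thread whose iter[tid].iter
// is 1 computes next_go = 1 + 3 = 4 and waits until its go flag holds exactly
// 4. Because iter cycles 0, 1, 2, 0, ..., the go targets cycle 3, 4, 5,
// 3, ..., so a stale go value left over from either of the two preceding
// barriers can never match the current target.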

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
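
/* Illustrative sketch (not part of the runtime): stripped of ITT/OMPT hooks,
   tasking, cancellation, and ICV propagation, the gather/release templates
   above reduce to the classic two-flag linear barrier below. All names here
   are hypothetical; epoch increases by one per barrier episode, mirroring
   KMP_BARRIER_STATE_BUMP.

     std::atomic<uint64_t> arrived[N]; // per-worker flag, like b_arrived
     std::atomic<uint64_t> go[N];      // per-worker flag, like b_go

     void worker(int tid, uint64_t epoch) {
       arrived[tid].store(epoch, std::memory_order_release); // gather
       while (go[tid].load(std::memory_order_acquire) != epoch) // release
         ; // spin (the runtime may instead sleep once blocktime expires)
     }

     void primary(uint64_t epoch) {
       for (int i = 1; i < N; ++i) // gather: wait for each worker in turn
         while (arrived[i].load(std::memory_order_acquire) != epoch)
           ;
       for (int i = 1; i < N; ++i) // release: set each worker's go flag
         go[i].store(epoch, std::memory_order_release);
     }
*/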

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived,
                new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the primary thread
       at any time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
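
// Tree indexing used by both the gather above and the release below: with
// branch_bits = 2 (branch_factor = 4), thread tid's children are
// (tid << 2) + 1 .. (tid << 2) + 4 and its parent is (tid - 1) >> 2. Worked
// example for a 10-thread team: tid 0 gathers from tids 1..4, tid 1 from
// 5..8, tid 2 from 9 only, and signals travel leaf-to-root, e.g. tid 7
// notifies (7 - 1) >> 2 = tid 1, which then notifies tid 0.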

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
9850b57cec5SDimitry Andric }
9860b57cec5SDimitry Andric #endif // KMP_BARRIER_ICV_PUSH
9870b57cec5SDimitry Andric KA_TRACE(20,
9880b57cec5SDimitry Andric ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
9890b57cec5SDimitry Andric " go(%p): %u => %u\n",
9900b57cec5SDimitry Andric gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
9910b57cec5SDimitry Andric team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
9920b57cec5SDimitry Andric child_bar->b_go + KMP_BARRIER_STATE_BUMP));
9930b57cec5SDimitry Andric // Release child from barrier
994e8d8bef9SDimitry Andric kmp_flag_64<> flag(&child_bar->b_go, child_thr);
9950b57cec5SDimitry Andric flag.release();
9960b57cec5SDimitry Andric child++;
9970b57cec5SDimitry Andric child_tid++;
9980b57cec5SDimitry Andric } while (child <= branch_factor && child_tid < nproc);
9990b57cec5SDimitry Andric }
10000b57cec5SDimitry Andric KA_TRACE(
10010b57cec5SDimitry Andric 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
10020b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt));
10030b57cec5SDimitry Andric }
10040b57cec5SDimitry Andric 
10050b57cec5SDimitry Andric // Hyper Barrier
1006fe6060f1SDimitry Andric static void __kmp_hyper_barrier_gather(
1007fe6060f1SDimitry Andric enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1008fe6060f1SDimitry Andric void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
10090b57cec5SDimitry Andric KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
10100b57cec5SDimitry Andric kmp_team_t *team = this_thr->th.th_team;
10110b57cec5SDimitry Andric kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
10120b57cec5SDimitry Andric kmp_info_t **other_threads = team->t.t_threads;
10130b57cec5SDimitry Andric kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
10140b57cec5SDimitry Andric kmp_uint32 num_threads = this_thr->th.th_team_nproc;
10150b57cec5SDimitry Andric kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
10160b57cec5SDimitry Andric kmp_uint32 branch_factor = 1 << branch_bits;
10170b57cec5SDimitry Andric kmp_uint32 offset;
10180b57cec5SDimitry Andric kmp_uint32 level;
10190b57cec5SDimitry Andric 
10200b57cec5SDimitry Andric KA_TRACE(
10210b57cec5SDimitry Andric 20,
10220b57cec5SDimitry Andric ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
10230b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt));
10240b57cec5SDimitry Andric KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
10250b57cec5SDimitry Andric 
10260b57cec5SDimitry Andric #if USE_ITT_BUILD && USE_ITT_NOTIFY
10270b57cec5SDimitry Andric // Barrier imbalance - save arrive time to the thread
10280b57cec5SDimitry Andric if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
10290b57cec5SDimitry Andric this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
10300b57cec5SDimitry Andric __itt_get_timestamp();
10310b57cec5SDimitry Andric }
10320b57cec5SDimitry Andric #endif
10330b57cec5SDimitry Andric /* Perform a hypercube-embedded tree gather to wait until all of the threads
10340b57cec5SDimitry Andric have arrived, and reduce any required data as we go.
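Editorial worked example (added for clarity; not part of the upstream
comment): with branch_bits == 1 (branch_factor == 2) and 8 threads,
level 0 pairs T#1->T#0, T#3->T#2, T#5->T#4, T#7->T#6; level 1 pairs
T#2->T#0 and T#6->T#4; level 2 pairs T#4->T#0. A thread exits the loop at
the first level where ((tid >> level) & (branch_factor - 1)) is nonzero,
and signals the parent whose tid is its own with the low
(level + branch_bits) bits cleared.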
*/ 1035e8d8bef9SDimitry Andric kmp_flag_64<> p_flag(&thr_bar->b_arrived); 10360b57cec5SDimitry Andric for (level = 0, offset = 1; offset < num_threads; 10370b57cec5SDimitry Andric level += branch_bits, offset <<= branch_bits) { 10380b57cec5SDimitry Andric kmp_uint32 child; 10390b57cec5SDimitry Andric kmp_uint32 child_tid; 10400b57cec5SDimitry Andric 10410b57cec5SDimitry Andric if (((tid >> level) & (branch_factor - 1)) != 0) { 10420b57cec5SDimitry Andric kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); 10430b57cec5SDimitry Andric 10445ffd83dbSDimitry Andric KMP_MB(); // Synchronize parent and child threads. 10450b57cec5SDimitry Andric KA_TRACE(20, 10460b57cec5SDimitry Andric ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " 10470b57cec5SDimitry Andric "arrived(%p): %llu => %llu\n", 10480b57cec5SDimitry Andric gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), 10490b57cec5SDimitry Andric team->t.t_id, parent_tid, &thr_bar->b_arrived, 10500b57cec5SDimitry Andric thr_bar->b_arrived, 10510b57cec5SDimitry Andric thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 10520b57cec5SDimitry Andric // Mark arrival to parent thread 10530b57cec5SDimitry Andric /* After performing this write (in the last iteration of the enclosing for 10540b57cec5SDimitry Andric loop), a worker thread may not assume that the team is valid any more 1055fe6060f1SDimitry Andric - it could be deallocated by the primary thread at any time. */ 10560b57cec5SDimitry Andric p_flag.set_waiter(other_threads[parent_tid]); 10570b57cec5SDimitry Andric p_flag.release(); 10580b57cec5SDimitry Andric break; 10590b57cec5SDimitry Andric } 10600b57cec5SDimitry Andric 10610b57cec5SDimitry Andric // Parent threads wait for children to arrive 10620b57cec5SDimitry Andric if (new_state == KMP_BARRIER_UNUSED_STATE) 10630b57cec5SDimitry Andric new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 10640b57cec5SDimitry Andric for (child = 1, child_tid = tid + (1 << level); 10650b57cec5SDimitry Andric child < branch_factor && child_tid < num_threads; 10660b57cec5SDimitry Andric child++, child_tid += (1 << level)) { 10670b57cec5SDimitry Andric kmp_info_t *child_thr = other_threads[child_tid]; 10680b57cec5SDimitry Andric kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 10690b57cec5SDimitry Andric #if KMP_CACHE_MANAGE 10700b57cec5SDimitry Andric kmp_uint32 next_child_tid = child_tid + (1 << level); 10710b57cec5SDimitry Andric // Prefetch next thread's arrived count 10720b57cec5SDimitry Andric if (child + 1 < branch_factor && next_child_tid < num_threads) 10730b57cec5SDimitry Andric KMP_CACHE_PREFETCH( 10740b57cec5SDimitry Andric &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); 10750b57cec5SDimitry Andric #endif /* KMP_CACHE_MANAGE */ 10760b57cec5SDimitry Andric KA_TRACE(20, 10770b57cec5SDimitry Andric ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " 10780b57cec5SDimitry Andric "arrived(%p) == %llu\n", 10790b57cec5SDimitry Andric gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 10800b57cec5SDimitry Andric team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); 10810b57cec5SDimitry Andric // Wait for child to arrive 1082e8d8bef9SDimitry Andric kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); 10830b57cec5SDimitry Andric c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 10845ffd83dbSDimitry Andric KMP_MB(); // Synchronize parent and child threads. 
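// Editorial note (added for clarity; not upstream): new_state is the team's
// b_arrived count plus KMP_BARRIER_STATE_BUMP, computed lazily the first
// time this thread acts as a parent; a child counts as arrived once its own
// b_arrived flag reaches this per-barrier target, which is exactly what
// c_flag.wait() above polls for.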
10850b57cec5SDimitry Andric #if USE_ITT_BUILD && USE_ITT_NOTIFY 10860b57cec5SDimitry Andric // Barrier imbalance - write min of the thread time and a child time to 10870b57cec5SDimitry Andric // the thread. 10880b57cec5SDimitry Andric if (__kmp_forkjoin_frames_mode == 2) { 10890b57cec5SDimitry Andric this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, 10900b57cec5SDimitry Andric child_thr->th.th_bar_min_time); 10910b57cec5SDimitry Andric } 10920b57cec5SDimitry Andric #endif 10930b57cec5SDimitry Andric if (reduce) { 10940b57cec5SDimitry Andric KA_TRACE(100, 10950b57cec5SDimitry Andric ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", 10960b57cec5SDimitry Andric gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 10970b57cec5SDimitry Andric team->t.t_id, child_tid)); 1098480093f4SDimitry Andric OMPT_REDUCTION_DECL(this_thr, gtid); 1099480093f4SDimitry Andric OMPT_REDUCTION_BEGIN; 11000b57cec5SDimitry Andric (*reduce)(this_thr->th.th_local.reduce_data, 11010b57cec5SDimitry Andric child_thr->th.th_local.reduce_data); 1102480093f4SDimitry Andric OMPT_REDUCTION_END; 11030b57cec5SDimitry Andric } 11040b57cec5SDimitry Andric } 11050b57cec5SDimitry Andric } 11060b57cec5SDimitry Andric 11070b57cec5SDimitry Andric if (KMP_MASTER_TID(tid)) { 1108fe6060f1SDimitry Andric // Need to update the team arrived pointer if we are the primary thread 11090b57cec5SDimitry Andric if (new_state == KMP_BARRIER_UNUSED_STATE) 11100b57cec5SDimitry Andric team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; 11110b57cec5SDimitry Andric else 11120b57cec5SDimitry Andric team->t.t_bar[bt].b_arrived = new_state; 11130b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " 11140b57cec5SDimitry Andric "arrived(%p) = %llu\n", 11150b57cec5SDimitry Andric gtid, team->t.t_id, tid, team->t.t_id, 11160b57cec5SDimitry Andric &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 11170b57cec5SDimitry Andric } 11180b57cec5SDimitry Andric KA_TRACE( 11190b57cec5SDimitry Andric 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", 11200b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt)); 11210b57cec5SDimitry Andric } 11220b57cec5SDimitry Andric 11230b57cec5SDimitry Andric // The reverse versions seem to beat the forward versions overall 11240b57cec5SDimitry Andric #define KMP_REVERSE_HYPER_BAR 11250b57cec5SDimitry Andric static void __kmp_hyper_barrier_release( 11260b57cec5SDimitry Andric enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 11270b57cec5SDimitry Andric int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 11280b57cec5SDimitry Andric KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); 11290b57cec5SDimitry Andric kmp_team_t *team; 11300b57cec5SDimitry Andric kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 11310b57cec5SDimitry Andric kmp_info_t **other_threads; 11320b57cec5SDimitry Andric kmp_uint32 num_threads; 11330b57cec5SDimitry Andric kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; 11340b57cec5SDimitry Andric kmp_uint32 branch_factor = 1 << branch_bits; 11350b57cec5SDimitry Andric kmp_uint32 child; 11360b57cec5SDimitry Andric kmp_uint32 child_tid; 11370b57cec5SDimitry Andric kmp_uint32 offset; 11380b57cec5SDimitry Andric kmp_uint32 level; 11390b57cec5SDimitry Andric 11400b57cec5SDimitry Andric /* Perform a hypercube-embedded tree release for all of the threads that have 11410b57cec5SDimitry Andric been gathered. 
If KMP_REVERSE_HYPER_BAR is defined (default) the threads 11420b57cec5SDimitry Andric are released in the reverse order of the corresponding gather, otherwise 11430b57cec5SDimitry Andric threads are released in the same order. */ 1144fe6060f1SDimitry Andric if (KMP_MASTER_TID(tid)) { // primary thread 11450b57cec5SDimitry Andric team = __kmp_threads[gtid]->th.th_team; 11460b57cec5SDimitry Andric KMP_DEBUG_ASSERT(team != NULL); 1147fe6060f1SDimitry Andric KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " 11480b57cec5SDimitry Andric "barrier type %d\n", 11490b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt)); 11500b57cec5SDimitry Andric #if KMP_BARRIER_ICV_PUSH 1151fe6060f1SDimitry Andric if (propagate_icvs) { // primary already has ICVs in final destination; copy 11520b57cec5SDimitry Andric copy_icvs(&thr_bar->th_fixed_icvs, 11530b57cec5SDimitry Andric &team->t.t_implicit_task_taskdata[tid].td_icvs); 11540b57cec5SDimitry Andric } 11550b57cec5SDimitry Andric #endif 11560b57cec5SDimitry Andric } else { // Handle fork barrier workers who aren't part of a team yet 11570b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, 11580b57cec5SDimitry Andric &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); 11590b57cec5SDimitry Andric // Wait for parent thread to release us 1160e8d8bef9SDimitry Andric kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 11610b57cec5SDimitry Andric flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 11620b57cec5SDimitry Andric #if USE_ITT_BUILD && USE_ITT_NOTIFY 11630b57cec5SDimitry Andric if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { 11640b57cec5SDimitry Andric // In fork barrier where we could not get the object reliably 11650b57cec5SDimitry Andric itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); 11660b57cec5SDimitry Andric // Cancel wait on previous parallel region... 11670b57cec5SDimitry Andric __kmp_itt_task_starting(itt_sync_obj); 11680b57cec5SDimitry Andric 11690b57cec5SDimitry Andric if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 11700b57cec5SDimitry Andric return; 11710b57cec5SDimitry Andric 11720b57cec5SDimitry Andric itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 11730b57cec5SDimitry Andric if (itt_sync_obj != NULL) 11740b57cec5SDimitry Andric // Call prepare as early as possible for "new" barrier 11750b57cec5SDimitry Andric __kmp_itt_task_finished(itt_sync_obj); 11760b57cec5SDimitry Andric } else 11770b57cec5SDimitry Andric #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 11780b57cec5SDimitry Andric // Early exit for reaping threads releasing forkjoin barrier 11790b57cec5SDimitry Andric if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 11800b57cec5SDimitry Andric return; 11810b57cec5SDimitry Andric 11820b57cec5SDimitry Andric // The worker thread may now assume that the team is valid. 11830b57cec5SDimitry Andric team = __kmp_threads[gtid]->th.th_team; 11840b57cec5SDimitry Andric KMP_DEBUG_ASSERT(team != NULL); 11850b57cec5SDimitry Andric tid = __kmp_tid_from_gtid(gtid); 11860b57cec5SDimitry Andric 11870b57cec5SDimitry Andric TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); 11880b57cec5SDimitry Andric KA_TRACE(20, 11890b57cec5SDimitry Andric ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 11900b57cec5SDimitry Andric gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 11910b57cec5SDimitry Andric KMP_MB(); // Flush all pending memory write invalidates. 
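// Editorial note (added for clarity; not upstream): the worker resets its
// b_go flag to KMP_INIT_BARRIER_STATE right after being released, so the
// flag is re-armed before the next barrier's release phase can bump it.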
11920b57cec5SDimitry Andric } 11930b57cec5SDimitry Andric num_threads = this_thr->th.th_team_nproc; 11940b57cec5SDimitry Andric other_threads = team->t.t_threads; 11950b57cec5SDimitry Andric 11960b57cec5SDimitry Andric #ifdef KMP_REVERSE_HYPER_BAR 11970b57cec5SDimitry Andric // Count up to correct level for parent 11980b57cec5SDimitry Andric for (level = 0, offset = 1; 11990b57cec5SDimitry Andric offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); 12000b57cec5SDimitry Andric level += branch_bits, offset <<= branch_bits) 12010b57cec5SDimitry Andric ; 12020b57cec5SDimitry Andric 12030b57cec5SDimitry Andric // Now go down from there 12040b57cec5SDimitry Andric for (level -= branch_bits, offset >>= branch_bits; offset != 0; 12050b57cec5SDimitry Andric level -= branch_bits, offset >>= branch_bits) 12060b57cec5SDimitry Andric #else 12070b57cec5SDimitry Andric // Go down the tree, level by level 12080b57cec5SDimitry Andric for (level = 0, offset = 1; offset < num_threads; 12090b57cec5SDimitry Andric level += branch_bits, offset <<= branch_bits) 12100b57cec5SDimitry Andric #endif // KMP_REVERSE_HYPER_BAR 12110b57cec5SDimitry Andric { 12120b57cec5SDimitry Andric #ifdef KMP_REVERSE_HYPER_BAR 12130b57cec5SDimitry Andric /* Now go in reverse order through the children, highest to lowest. 12140b57cec5SDimitry Andric Initial setting of child is conservative here. */ 12150b57cec5SDimitry Andric child = num_threads >> ((level == 0) ? level : level - 1); 12160b57cec5SDimitry Andric for (child = (child < branch_factor - 1) ? child : branch_factor - 1, 12170b57cec5SDimitry Andric child_tid = tid + (child << level); 12180b57cec5SDimitry Andric child >= 1; child--, child_tid -= (1 << level)) 12190b57cec5SDimitry Andric #else 12200b57cec5SDimitry Andric if (((tid >> level) & (branch_factor - 1)) != 0) 12210b57cec5SDimitry Andric // No need to go lower than this, since this is the level parent would be 12220b57cec5SDimitry Andric // notified 12230b57cec5SDimitry Andric break; 12240b57cec5SDimitry Andric // Iterate through children on this level of the tree 12250b57cec5SDimitry Andric for (child = 1, child_tid = tid + (1 << level); 12260b57cec5SDimitry Andric child < branch_factor && child_tid < num_threads; 12270b57cec5SDimitry Andric child++, child_tid += (1 << level)) 12280b57cec5SDimitry Andric #endif // KMP_REVERSE_HYPER_BAR 12290b57cec5SDimitry Andric { 12300b57cec5SDimitry Andric if (child_tid >= num_threads) 12310b57cec5SDimitry Andric continue; // Child doesn't exist so keep going 12320b57cec5SDimitry Andric else { 12330b57cec5SDimitry Andric kmp_info_t *child_thr = other_threads[child_tid]; 12340b57cec5SDimitry Andric kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 12350b57cec5SDimitry Andric #if KMP_CACHE_MANAGE 12360b57cec5SDimitry Andric kmp_uint32 next_child_tid = child_tid - (1 << level); 12370b57cec5SDimitry Andric // Prefetch next thread's go count 12380b57cec5SDimitry Andric #ifdef KMP_REVERSE_HYPER_BAR 12390b57cec5SDimitry Andric if (child - 1 >= 1 && next_child_tid < num_threads) 12400b57cec5SDimitry Andric #else 12410b57cec5SDimitry Andric if (child + 1 < branch_factor && next_child_tid < num_threads) 12420b57cec5SDimitry Andric #endif // KMP_REVERSE_HYPER_BAR 12430b57cec5SDimitry Andric KMP_CACHE_PREFETCH( 12440b57cec5SDimitry Andric &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); 12450b57cec5SDimitry Andric #endif /* KMP_CACHE_MANAGE */ 12460b57cec5SDimitry Andric 12470b57cec5SDimitry Andric #if KMP_BARRIER_ICV_PUSH 12480b57cec5SDimitry Andric if 
(propagate_icvs) // push my fixed ICVs to my child
12490b57cec5SDimitry Andric copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
12500b57cec5SDimitry Andric #endif // KMP_BARRIER_ICV_PUSH
12510b57cec5SDimitry Andric 
12520b57cec5SDimitry Andric KA_TRACE(
12530b57cec5SDimitry Andric 20,
12540b57cec5SDimitry Andric ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
12550b57cec5SDimitry Andric " go(%p): %u => %u\n",
12560b57cec5SDimitry Andric gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
12570b57cec5SDimitry Andric team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
12580b57cec5SDimitry Andric child_bar->b_go + KMP_BARRIER_STATE_BUMP));
12590b57cec5SDimitry Andric // Release child from barrier
1260e8d8bef9SDimitry Andric kmp_flag_64<> flag(&child_bar->b_go, child_thr);
12610b57cec5SDimitry Andric flag.release();
12620b57cec5SDimitry Andric }
12630b57cec5SDimitry Andric }
12640b57cec5SDimitry Andric }
12650b57cec5SDimitry Andric #if KMP_BARRIER_ICV_PUSH
12660b57cec5SDimitry Andric if (propagate_icvs &&
12670b57cec5SDimitry Andric !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
12680b57cec5SDimitry Andric __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
12690b57cec5SDimitry Andric FALSE);
12700b57cec5SDimitry Andric copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
12710b57cec5SDimitry Andric &thr_bar->th_fixed_icvs);
12720b57cec5SDimitry Andric }
12730b57cec5SDimitry Andric #endif
12740b57cec5SDimitry Andric KA_TRACE(
12750b57cec5SDimitry Andric 20,
12760b57cec5SDimitry Andric ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
12770b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt));
12780b57cec5SDimitry Andric }
12790b57cec5SDimitry Andric 
12800b57cec5SDimitry Andric // Hierarchical Barrier
12810b57cec5SDimitry Andric 
12820b57cec5SDimitry Andric // Initialize thread barrier data
12830b57cec5SDimitry Andric /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
12840b57cec5SDimitry Andric Performs the minimum amount of initialization required based on how the team
12850b57cec5SDimitry Andric has changed. Returns true if leaf children will require both on-core and
12860b57cec5SDimitry Andric traditional wake-up mechanisms. For example, if the team size increases,
12870b57cec5SDimitry Andric threads already in the team will respond to on-core wakeup on their parent
12880b57cec5SDimitry Andric thread, but threads newly added to the team will only be listening on
12890b57cec5SDimitry Andric their local b_go.
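Editorial worked example (added for clarity; not part of the upstream
comment): with leaf_kids == 3, the initializer below sets bytes 7, 6 and 5
of leaf_state to 1, so a parent can wait for all of its on-core leaves
with a single 64-bit comparison against its b_arrived word, while each
leaf checks in by writing just its own byte (the byte index is derived
from thr_bar->offset).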
*/ 12900b57cec5SDimitry Andric static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, 12910b57cec5SDimitry Andric kmp_bstate_t *thr_bar, 12920b57cec5SDimitry Andric kmp_uint32 nproc, int gtid, 12930b57cec5SDimitry Andric int tid, kmp_team_t *team) { 12940b57cec5SDimitry Andric // Checks to determine if (re-)initialization is needed 12950b57cec5SDimitry Andric bool uninitialized = thr_bar->team == NULL; 12960b57cec5SDimitry Andric bool team_changed = team != thr_bar->team; 12970b57cec5SDimitry Andric bool team_sz_changed = nproc != thr_bar->nproc; 12980b57cec5SDimitry Andric bool tid_changed = tid != thr_bar->old_tid; 12990b57cec5SDimitry Andric bool retval = false; 13000b57cec5SDimitry Andric 13010b57cec5SDimitry Andric if (uninitialized || team_sz_changed) { 13020b57cec5SDimitry Andric __kmp_get_hierarchy(nproc, thr_bar); 13030b57cec5SDimitry Andric } 13040b57cec5SDimitry Andric 13050b57cec5SDimitry Andric if (uninitialized || team_sz_changed || tid_changed) { 1306fe6060f1SDimitry Andric thr_bar->my_level = thr_bar->depth - 1; // default for primary thread 1307fe6060f1SDimitry Andric thr_bar->parent_tid = -1; // default for primary thread 1308fe6060f1SDimitry Andric if (!KMP_MASTER_TID(tid)) { 1309fe6060f1SDimitry Andric // if not primary thread, find parent thread in hierarchy 13100b57cec5SDimitry Andric kmp_uint32 d = 0; 13110b57cec5SDimitry Andric while (d < thr_bar->depth) { // find parent based on level of thread in 13120b57cec5SDimitry Andric // hierarchy, and note level 13130b57cec5SDimitry Andric kmp_uint32 rem; 1314fe6060f1SDimitry Andric if (d == thr_bar->depth - 2) { // reached level right below the primary 13150b57cec5SDimitry Andric thr_bar->parent_tid = 0; 13160b57cec5SDimitry Andric thr_bar->my_level = d; 13170b57cec5SDimitry Andric break; 1318e8d8bef9SDimitry Andric } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { 1319e8d8bef9SDimitry Andric // TODO: can we make the above op faster? 
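// Editorial note (added for clarity; not upstream): skip_per_level[d] is the
// subtree stride at depth d, so a nonzero remainder tid % skip_per_level[d+1]
// means this thread is not a subtree root one level up; subtracting that
// remainder lands exactly on the tid of the enclosing subtree's root, which
// becomes the parent.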
13200b57cec5SDimitry Andric // thread is not a subtree root at next level, so this is max 13210b57cec5SDimitry Andric thr_bar->parent_tid = tid - rem; 13220b57cec5SDimitry Andric thr_bar->my_level = d; 13230b57cec5SDimitry Andric break; 13240b57cec5SDimitry Andric } 13250b57cec5SDimitry Andric ++d; 13260b57cec5SDimitry Andric } 13270b57cec5SDimitry Andric } 1328e8d8bef9SDimitry Andric __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) / 1329e8d8bef9SDimitry Andric (thr_bar->skip_per_level[thr_bar->my_level])), 1330e8d8bef9SDimitry Andric &(thr_bar->offset)); 13310b57cec5SDimitry Andric thr_bar->old_tid = tid; 13320b57cec5SDimitry Andric thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 13330b57cec5SDimitry Andric thr_bar->team = team; 13340b57cec5SDimitry Andric thr_bar->parent_bar = 13350b57cec5SDimitry Andric &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 13360b57cec5SDimitry Andric } 13370b57cec5SDimitry Andric if (uninitialized || team_changed || tid_changed) { 13380b57cec5SDimitry Andric thr_bar->team = team; 13390b57cec5SDimitry Andric thr_bar->parent_bar = 13400b57cec5SDimitry Andric &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; 13410b57cec5SDimitry Andric retval = true; 13420b57cec5SDimitry Andric } 13430b57cec5SDimitry Andric if (uninitialized || team_sz_changed || tid_changed) { 13440b57cec5SDimitry Andric thr_bar->nproc = nproc; 13450b57cec5SDimitry Andric thr_bar->leaf_kids = thr_bar->base_leaf_kids; 13460b57cec5SDimitry Andric if (thr_bar->my_level == 0) 13470b57cec5SDimitry Andric thr_bar->leaf_kids = 0; 13480b57cec5SDimitry Andric if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) 1349e8d8bef9SDimitry Andric __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids)); 13500b57cec5SDimitry Andric thr_bar->leaf_state = 0; 13510b57cec5SDimitry Andric for (int i = 0; i < thr_bar->leaf_kids; ++i) 13520b57cec5SDimitry Andric ((char *)&(thr_bar->leaf_state))[7 - i] = 1; 13530b57cec5SDimitry Andric } 13540b57cec5SDimitry Andric return retval; 13550b57cec5SDimitry Andric } 13560b57cec5SDimitry Andric 13570b57cec5SDimitry Andric static void __kmp_hierarchical_barrier_gather( 13580b57cec5SDimitry Andric enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 13590b57cec5SDimitry Andric void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 13600b57cec5SDimitry Andric KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); 13610b57cec5SDimitry Andric kmp_team_t *team = this_thr->th.th_team; 13620b57cec5SDimitry Andric kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 13630b57cec5SDimitry Andric kmp_uint32 nproc = this_thr->th.th_team_nproc; 13640b57cec5SDimitry Andric kmp_info_t **other_threads = team->t.t_threads; 1365fe6060f1SDimitry Andric kmp_uint64 new_state = 0; 13660b57cec5SDimitry Andric 13670b57cec5SDimitry Andric int level = team->t.t_level; 13680b57cec5SDimitry Andric if (other_threads[0] 13690b57cec5SDimitry Andric ->th.th_teams_microtask) // are we inside the teams construct? 
13700b57cec5SDimitry Andric if (this_thr->th.th_teams_size.nteams > 1) 13710b57cec5SDimitry Andric ++level; // level was not increased in teams construct for team_of_masters 13720b57cec5SDimitry Andric if (level == 1) 13730b57cec5SDimitry Andric thr_bar->use_oncore_barrier = 1; 13740b57cec5SDimitry Andric else 13750b57cec5SDimitry Andric thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 13760b57cec5SDimitry Andric 13770b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " 13780b57cec5SDimitry Andric "barrier type %d\n", 13790b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt)); 13800b57cec5SDimitry Andric KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); 13810b57cec5SDimitry Andric 13820b57cec5SDimitry Andric #if USE_ITT_BUILD && USE_ITT_NOTIFY 13830b57cec5SDimitry Andric // Barrier imbalance - save arrive time to the thread 13840b57cec5SDimitry Andric if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { 13850b57cec5SDimitry Andric this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); 13860b57cec5SDimitry Andric } 13870b57cec5SDimitry Andric #endif 13880b57cec5SDimitry Andric 13890b57cec5SDimitry Andric (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, 13900b57cec5SDimitry Andric team); 13910b57cec5SDimitry Andric 13920b57cec5SDimitry Andric if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) 13930b57cec5SDimitry Andric kmp_int32 child_tid; 13940b57cec5SDimitry Andric new_state = 13950b57cec5SDimitry Andric (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 13960b57cec5SDimitry Andric if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 13970b57cec5SDimitry Andric thr_bar->use_oncore_barrier) { 13980b57cec5SDimitry Andric if (thr_bar->leaf_kids) { 13990b57cec5SDimitry Andric // First, wait for leaf children to check-in on my b_arrived flag 14000b57cec5SDimitry Andric kmp_uint64 leaf_state = 14010b57cec5SDimitry Andric KMP_MASTER_TID(tid) 14020b57cec5SDimitry Andric ? 
thr_bar->b_arrived | thr_bar->leaf_state 14030b57cec5SDimitry Andric : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; 14040b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " 14050b57cec5SDimitry Andric "for leaf kids\n", 14060b57cec5SDimitry Andric gtid, team->t.t_id, tid)); 1407e8d8bef9SDimitry Andric kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); 14080b57cec5SDimitry Andric flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 14090b57cec5SDimitry Andric if (reduce) { 1410480093f4SDimitry Andric OMPT_REDUCTION_DECL(this_thr, gtid); 1411480093f4SDimitry Andric OMPT_REDUCTION_BEGIN; 14120b57cec5SDimitry Andric for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; 14130b57cec5SDimitry Andric ++child_tid) { 14140b57cec5SDimitry Andric KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 14150b57cec5SDimitry Andric "T#%d(%d:%d)\n", 14160b57cec5SDimitry Andric gtid, team->t.t_id, tid, 14170b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 14180b57cec5SDimitry Andric child_tid)); 14190b57cec5SDimitry Andric (*reduce)(this_thr->th.th_local.reduce_data, 14200b57cec5SDimitry Andric other_threads[child_tid]->th.th_local.reduce_data); 14210b57cec5SDimitry Andric } 1422480093f4SDimitry Andric OMPT_REDUCTION_END; 14230b57cec5SDimitry Andric } 14240b57cec5SDimitry Andric // clear leaf_state bits 14250b57cec5SDimitry Andric KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); 14260b57cec5SDimitry Andric } 14270b57cec5SDimitry Andric // Next, wait for higher level children on each child's b_arrived flag 14280b57cec5SDimitry Andric for (kmp_uint32 d = 1; d < thr_bar->my_level; 14290b57cec5SDimitry Andric ++d) { // gather lowest level threads first, but skip 0 14300b57cec5SDimitry Andric kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 14310b57cec5SDimitry Andric skip = thr_bar->skip_per_level[d]; 14320b57cec5SDimitry Andric if (last > nproc) 14330b57cec5SDimitry Andric last = nproc; 14340b57cec5SDimitry Andric for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 14350b57cec5SDimitry Andric kmp_info_t *child_thr = other_threads[child_tid]; 14360b57cec5SDimitry Andric kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 14370b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 14380b57cec5SDimitry Andric "T#%d(%d:%d) " 14390b57cec5SDimitry Andric "arrived(%p) == %llu\n", 14400b57cec5SDimitry Andric gtid, team->t.t_id, tid, 14410b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 14420b57cec5SDimitry Andric child_tid, &child_bar->b_arrived, new_state)); 1443e8d8bef9SDimitry Andric kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 14440b57cec5SDimitry Andric flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 14450b57cec5SDimitry Andric if (reduce) { 14460b57cec5SDimitry Andric KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 14470b57cec5SDimitry Andric "T#%d(%d:%d)\n", 14480b57cec5SDimitry Andric gtid, team->t.t_id, tid, 14490b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 14500b57cec5SDimitry Andric child_tid)); 14510b57cec5SDimitry Andric (*reduce)(this_thr->th.th_local.reduce_data, 14520b57cec5SDimitry Andric child_thr->th.th_local.reduce_data); 14530b57cec5SDimitry Andric } 14540b57cec5SDimitry Andric } 14550b57cec5SDimitry Andric } 14560b57cec5SDimitry Andric } else { // Blocktime is not infinite 14570b57cec5SDimitry Andric for 
(kmp_uint32 d = 0; d < thr_bar->my_level; 14580b57cec5SDimitry Andric ++d) { // Gather lowest level threads first 14590b57cec5SDimitry Andric kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], 14600b57cec5SDimitry Andric skip = thr_bar->skip_per_level[d]; 14610b57cec5SDimitry Andric if (last > nproc) 14620b57cec5SDimitry Andric last = nproc; 14630b57cec5SDimitry Andric for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 14640b57cec5SDimitry Andric kmp_info_t *child_thr = other_threads[child_tid]; 14650b57cec5SDimitry Andric kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 14660b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " 14670b57cec5SDimitry Andric "T#%d(%d:%d) " 14680b57cec5SDimitry Andric "arrived(%p) == %llu\n", 14690b57cec5SDimitry Andric gtid, team->t.t_id, tid, 14700b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 14710b57cec5SDimitry Andric child_tid, &child_bar->b_arrived, new_state)); 1472e8d8bef9SDimitry Andric kmp_flag_64<> flag(&child_bar->b_arrived, new_state); 14730b57cec5SDimitry Andric flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 14740b57cec5SDimitry Andric if (reduce) { 14750b57cec5SDimitry Andric KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " 14760b57cec5SDimitry Andric "T#%d(%d:%d)\n", 14770b57cec5SDimitry Andric gtid, team->t.t_id, tid, 14780b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 14790b57cec5SDimitry Andric child_tid)); 14800b57cec5SDimitry Andric (*reduce)(this_thr->th.th_local.reduce_data, 14810b57cec5SDimitry Andric child_thr->th.th_local.reduce_data); 14820b57cec5SDimitry Andric } 14830b57cec5SDimitry Andric } 14840b57cec5SDimitry Andric } 14850b57cec5SDimitry Andric } 14860b57cec5SDimitry Andric } 1487fe6060f1SDimitry Andric // All subordinates are gathered; now release parent if not primary thread 14880b57cec5SDimitry Andric 14890b57cec5SDimitry Andric if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy 14900b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" 14910b57cec5SDimitry Andric " T#%d(%d:%d) arrived(%p): %llu => %llu\n", 14920b57cec5SDimitry Andric gtid, team->t.t_id, tid, 14930b57cec5SDimitry Andric __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, 14940b57cec5SDimitry Andric thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, 14950b57cec5SDimitry Andric thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); 14960b57cec5SDimitry Andric /* Mark arrival to parent: After performing this write, a worker thread may 14970b57cec5SDimitry Andric not assume that the team is valid any more - it could be deallocated by 1498fe6060f1SDimitry Andric the primary thread at any time. 
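Editorial note (added for clarity; not part of the upstream comment): two
release paths follow. The general path bumps this thread's own b_arrived
flag, which the parent polls; the on-core leaf path instead writes a
single byte of the parent's b_arrived word via kmp_flag_oncore, matching
the leaf_state mask the parent established during initialization.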
*/ 14990b57cec5SDimitry Andric if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || 15000b57cec5SDimitry Andric !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived 15010b57cec5SDimitry Andric // flag; release it 1502e8d8bef9SDimitry Andric kmp_flag_64<> flag(&thr_bar->b_arrived, 1503e8d8bef9SDimitry Andric other_threads[thr_bar->parent_tid]); 15040b57cec5SDimitry Andric flag.release(); 15050b57cec5SDimitry Andric } else { 15060b57cec5SDimitry Andric // Leaf does special release on "offset" bits of parent's b_arrived flag 15070b57cec5SDimitry Andric thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; 1508e8d8bef9SDimitry Andric kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, 1509e8d8bef9SDimitry Andric thr_bar->offset + 1); 15100b57cec5SDimitry Andric flag.set_waiter(other_threads[thr_bar->parent_tid]); 15110b57cec5SDimitry Andric flag.release(); 15120b57cec5SDimitry Andric } 1513fe6060f1SDimitry Andric } else { // Primary thread needs to update the team's b_arrived value 15140b57cec5SDimitry Andric team->t.t_bar[bt].b_arrived = new_state; 15150b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " 15160b57cec5SDimitry Andric "arrived(%p) = %llu\n", 15170b57cec5SDimitry Andric gtid, team->t.t_id, tid, team->t.t_id, 15180b57cec5SDimitry Andric &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); 15190b57cec5SDimitry Andric } 15200b57cec5SDimitry Andric // Is the team access below unsafe or just technically invalid? 15210b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 15220b57cec5SDimitry Andric "barrier type %d\n", 15230b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt)); 15240b57cec5SDimitry Andric } 15250b57cec5SDimitry Andric 15260b57cec5SDimitry Andric static void __kmp_hierarchical_barrier_release( 15270b57cec5SDimitry Andric enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 15280b57cec5SDimitry Andric int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 15290b57cec5SDimitry Andric KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); 15300b57cec5SDimitry Andric kmp_team_t *team; 15310b57cec5SDimitry Andric kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 15320b57cec5SDimitry Andric kmp_uint32 nproc; 15330b57cec5SDimitry Andric bool team_change = false; // indicates on-core barrier shouldn't be used 15340b57cec5SDimitry Andric 15350b57cec5SDimitry Andric if (KMP_MASTER_TID(tid)) { 15360b57cec5SDimitry Andric team = __kmp_threads[gtid]->th.th_team; 15370b57cec5SDimitry Andric KMP_DEBUG_ASSERT(team != NULL); 1538fe6060f1SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " 15390b57cec5SDimitry Andric "entered barrier type %d\n", 15400b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt)); 15410b57cec5SDimitry Andric } else { // Worker threads 15420b57cec5SDimitry Andric // Wait for parent thread to release me 15430b57cec5SDimitry Andric if (!thr_bar->use_oncore_barrier || 15440b57cec5SDimitry Andric __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || 15450b57cec5SDimitry Andric thr_bar->team == NULL) { 15460b57cec5SDimitry Andric // Use traditional method of waiting on my own b_go flag 15470b57cec5SDimitry Andric thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; 1548e8d8bef9SDimitry Andric kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 15490b57cec5SDimitry Andric flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 15500b57cec5SDimitry 
Andric TCW_8(thr_bar->b_go, 15510b57cec5SDimitry Andric KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 15520b57cec5SDimitry Andric } else { // Thread barrier data is initialized, this is a leaf, blocktime is 15530b57cec5SDimitry Andric // infinite, not nested 15540b57cec5SDimitry Andric // Wait on my "offset" bits on parent's b_go flag 15550b57cec5SDimitry Andric thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; 15560b57cec5SDimitry Andric kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, 1557e8d8bef9SDimitry Andric thr_bar->offset + 1, bt, 15580b57cec5SDimitry Andric this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); 15590b57cec5SDimitry Andric flag.wait(this_thr, TRUE); 15600b57cec5SDimitry Andric if (thr_bar->wait_flag == 15610b57cec5SDimitry Andric KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go 15620b57cec5SDimitry Andric TCW_8(thr_bar->b_go, 15630b57cec5SDimitry Andric KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 15640b57cec5SDimitry Andric } else { // Reset my bits on parent's b_go flag 15650b57cec5SDimitry Andric (RCAST(volatile char *, 1566e8d8bef9SDimitry Andric &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; 15670b57cec5SDimitry Andric } 15680b57cec5SDimitry Andric } 15690b57cec5SDimitry Andric thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 15700b57cec5SDimitry Andric // Early exit for reaping threads releasing forkjoin barrier 15710b57cec5SDimitry Andric if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 15720b57cec5SDimitry Andric return; 15730b57cec5SDimitry Andric // The worker thread may now assume that the team is valid. 15740b57cec5SDimitry Andric team = __kmp_threads[gtid]->th.th_team; 15750b57cec5SDimitry Andric KMP_DEBUG_ASSERT(team != NULL); 15760b57cec5SDimitry Andric tid = __kmp_tid_from_gtid(gtid); 15770b57cec5SDimitry Andric 15780b57cec5SDimitry Andric KA_TRACE( 15790b57cec5SDimitry Andric 20, 15800b57cec5SDimitry Andric ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 15810b57cec5SDimitry Andric gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 15820b57cec5SDimitry Andric KMP_MB(); // Flush all pending memory write invalidates. 15830b57cec5SDimitry Andric } 15840b57cec5SDimitry Andric 15850b57cec5SDimitry Andric nproc = this_thr->th.th_team_nproc; 15860b57cec5SDimitry Andric int level = team->t.t_level; 15870b57cec5SDimitry Andric if (team->t.t_threads[0] 15880b57cec5SDimitry Andric ->th.th_teams_microtask) { // are we inside the teams construct? 15890b57cec5SDimitry Andric if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 15900b57cec5SDimitry Andric this_thr->th.th_teams_level == level) 15910b57cec5SDimitry Andric ++level; // level was not increased in teams construct for team_of_workers 15920b57cec5SDimitry Andric if (this_thr->th.th_teams_size.nteams > 1) 15930b57cec5SDimitry Andric ++level; // level was not increased in teams construct for team_of_masters 15940b57cec5SDimitry Andric } 15950b57cec5SDimitry Andric if (level == 1) 15960b57cec5SDimitry Andric thr_bar->use_oncore_barrier = 1; 15970b57cec5SDimitry Andric else 15980b57cec5SDimitry Andric thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 15990b57cec5SDimitry Andric 16000b57cec5SDimitry Andric // If the team size has increased, we still communicate with old leaves via 16010b57cec5SDimitry Andric // oncore barrier. 
16020b57cec5SDimitry Andric unsigned short int old_leaf_kids = thr_bar->leaf_kids;
16030b57cec5SDimitry Andric kmp_uint64 old_leaf_state = thr_bar->leaf_state;
16040b57cec5SDimitry Andric team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
16050b57cec5SDimitry Andric tid, team);
16060b57cec5SDimitry Andric // But if the entire team changes, we won't use oncore barrier at all
16070b57cec5SDimitry Andric if (team_change)
16080b57cec5SDimitry Andric old_leaf_kids = 0;
16090b57cec5SDimitry Andric 
16100b57cec5SDimitry Andric #if KMP_BARRIER_ICV_PUSH
16110b57cec5SDimitry Andric if (propagate_icvs) {
16120b57cec5SDimitry Andric __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
16130b57cec5SDimitry Andric FALSE);
16140b57cec5SDimitry Andric if (KMP_MASTER_TID(
1615fe6060f1SDimitry Andric tid)) { // primary already has copy in final destination; copy
16160b57cec5SDimitry Andric copy_icvs(&thr_bar->th_fixed_icvs,
16170b57cec5SDimitry Andric &team->t.t_implicit_task_taskdata[tid].td_icvs);
16180b57cec5SDimitry Andric } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
16190b57cec5SDimitry Andric thr_bar->use_oncore_barrier) { // optimization for inf blocktime
16200b57cec5SDimitry Andric if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
16210b57cec5SDimitry Andric // leaves (on-core children) pull parent's fixed ICVs directly to local
16220b57cec5SDimitry Andric // ICV store
16230b57cec5SDimitry Andric copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
16240b57cec5SDimitry Andric &thr_bar->parent_bar->th_fixed_icvs);
16250b57cec5SDimitry Andric // non-leaves will get ICVs piggybacked with b_go via NGO store
16260b57cec5SDimitry Andric } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
16270b57cec5SDimitry Andric if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
16280b57cec5SDimitry Andric // access
16290b57cec5SDimitry Andric copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
16300b57cec5SDimitry Andric else // leaves copy parent's fixed ICVs directly to local ICV store
16310b57cec5SDimitry Andric copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
16320b57cec5SDimitry Andric &thr_bar->parent_bar->th_fixed_icvs);
16330b57cec5SDimitry Andric }
16340b57cec5SDimitry Andric }
16350b57cec5SDimitry Andric #endif // KMP_BARRIER_ICV_PUSH
16360b57cec5SDimitry Andric 
16370b57cec5SDimitry Andric // Now, release my children
16380b57cec5SDimitry Andric if (thr_bar->my_level) { // not a leaf
16390b57cec5SDimitry Andric kmp_int32 child_tid;
16400b57cec5SDimitry Andric kmp_uint32 last;
16410b57cec5SDimitry Andric if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
16420b57cec5SDimitry Andric thr_bar->use_oncore_barrier) {
16430b57cec5SDimitry Andric if (KMP_MASTER_TID(tid)) { // do a flat release
16440b57cec5SDimitry Andric // Set local b_go to bump children via NGO store of the cache line
16450b57cec5SDimitry Andric // containing ICVs and b_go.
16460b57cec5SDimitry Andric thr_bar->b_go = KMP_BARRIER_STATE_BUMP; 16470b57cec5SDimitry Andric // Use ngo stores if available; b_go piggybacks in the last 8 bytes of 16480b57cec5SDimitry Andric // the cache line 16490b57cec5SDimitry Andric ngo_load(&thr_bar->th_fixed_icvs); 16500b57cec5SDimitry Andric // This loops over all the threads skipping only the leaf nodes in the 16510b57cec5SDimitry Andric // hierarchy 16520b57cec5SDimitry Andric for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; 16530b57cec5SDimitry Andric child_tid += thr_bar->skip_per_level[1]) { 16540b57cec5SDimitry Andric kmp_bstate_t *child_bar = 16550b57cec5SDimitry Andric &team->t.t_threads[child_tid]->th.th_bar[bt].bb; 16560b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 16570b57cec5SDimitry Andric "releasing T#%d(%d:%d)" 16580b57cec5SDimitry Andric " go(%p): %u => %u\n", 16590b57cec5SDimitry Andric gtid, team->t.t_id, tid, 16600b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 16610b57cec5SDimitry Andric child_tid, &child_bar->b_go, child_bar->b_go, 16620b57cec5SDimitry Andric child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 16630b57cec5SDimitry Andric // Use ngo store (if available) to both store ICVs and release child 16640b57cec5SDimitry Andric // via child's b_go 16650b57cec5SDimitry Andric ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 16660b57cec5SDimitry Andric } 16670b57cec5SDimitry Andric ngo_sync(); 16680b57cec5SDimitry Andric } 16690b57cec5SDimitry Andric TCW_8(thr_bar->b_go, 16700b57cec5SDimitry Andric KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 16710b57cec5SDimitry Andric // Now, release leaf children 16720b57cec5SDimitry Andric if (thr_bar->leaf_kids) { // if there are any 16730b57cec5SDimitry Andric // We test team_change on the off-chance that the level 1 team changed. 
16740b57cec5SDimitry Andric if (team_change || 16750b57cec5SDimitry Andric old_leaf_kids < thr_bar->leaf_kids) { // some old, some new 16760b57cec5SDimitry Andric if (old_leaf_kids) { // release old leaf kids 16770b57cec5SDimitry Andric thr_bar->b_go |= old_leaf_state; 16780b57cec5SDimitry Andric } 16790b57cec5SDimitry Andric // Release new leaf kids 16800b57cec5SDimitry Andric last = tid + thr_bar->skip_per_level[1]; 16810b57cec5SDimitry Andric if (last > nproc) 16820b57cec5SDimitry Andric last = nproc; 16830b57cec5SDimitry Andric for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; 16840b57cec5SDimitry Andric ++child_tid) { // skip_per_level[0]=1 16850b57cec5SDimitry Andric kmp_info_t *child_thr = team->t.t_threads[child_tid]; 16860b57cec5SDimitry Andric kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 16870b57cec5SDimitry Andric KA_TRACE( 16880b57cec5SDimitry Andric 20, 16890b57cec5SDimitry Andric ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 16900b57cec5SDimitry Andric " T#%d(%d:%d) go(%p): %u => %u\n", 16910b57cec5SDimitry Andric gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 16920b57cec5SDimitry Andric team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 16930b57cec5SDimitry Andric child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 16940b57cec5SDimitry Andric // Release child using child's b_go flag 1695e8d8bef9SDimitry Andric kmp_flag_64<> flag(&child_bar->b_go, child_thr); 16960b57cec5SDimitry Andric flag.release(); 16970b57cec5SDimitry Andric } 16980b57cec5SDimitry Andric } else { // Release all children at once with leaf_state bits on my own 16990b57cec5SDimitry Andric // b_go flag 17000b57cec5SDimitry Andric thr_bar->b_go |= thr_bar->leaf_state; 17010b57cec5SDimitry Andric } 17020b57cec5SDimitry Andric } 17030b57cec5SDimitry Andric } else { // Blocktime is not infinite; do a simple hierarchical release 17040b57cec5SDimitry Andric for (int d = thr_bar->my_level - 1; d >= 0; 17050b57cec5SDimitry Andric --d) { // Release highest level threads first 17060b57cec5SDimitry Andric last = tid + thr_bar->skip_per_level[d + 1]; 17070b57cec5SDimitry Andric kmp_uint32 skip = thr_bar->skip_per_level[d]; 17080b57cec5SDimitry Andric if (last > nproc) 17090b57cec5SDimitry Andric last = nproc; 17100b57cec5SDimitry Andric for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 17110b57cec5SDimitry Andric kmp_info_t *child_thr = team->t.t_threads[child_tid]; 17120b57cec5SDimitry Andric kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 17130b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 17140b57cec5SDimitry Andric "releasing T#%d(%d:%d) go(%p): %u => %u\n", 17150b57cec5SDimitry Andric gtid, team->t.t_id, tid, 17160b57cec5SDimitry Andric __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 17170b57cec5SDimitry Andric child_tid, &child_bar->b_go, child_bar->b_go, 17180b57cec5SDimitry Andric child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 17190b57cec5SDimitry Andric // Release child using child's b_go flag 1720e8d8bef9SDimitry Andric kmp_flag_64<> flag(&child_bar->b_go, child_thr); 17210b57cec5SDimitry Andric flag.release(); 17220b57cec5SDimitry Andric } 17230b57cec5SDimitry Andric } 17240b57cec5SDimitry Andric } 17250b57cec5SDimitry Andric #if KMP_BARRIER_ICV_PUSH 17260b57cec5SDimitry Andric if (propagate_icvs && !KMP_MASTER_TID(tid)) 17270b57cec5SDimitry Andric // non-leaves copy ICVs from fixed ICVs to local dest 17280b57cec5SDimitry Andric 
copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
17290b57cec5SDimitry Andric &thr_bar->th_fixed_icvs);
17300b57cec5SDimitry Andric #endif // KMP_BARRIER_ICV_PUSH
17310b57cec5SDimitry Andric }
17320b57cec5SDimitry Andric KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
17330b57cec5SDimitry Andric "barrier type %d\n",
17340b57cec5SDimitry Andric gtid, team->t.t_id, tid, bt));
17350b57cec5SDimitry Andric }
17360b57cec5SDimitry Andric 
17370b57cec5SDimitry Andric // End of Barrier Algorithms
17380b57cec5SDimitry Andric 
17390b57cec5SDimitry Andric // type traits for cancellable value
17400b57cec5SDimitry Andric // if cancellable is true, then is_cancellable is a normal boolean variable
17410b57cec5SDimitry Andric // if cancellable is false, then is_cancellable is a compile-time constant
17420b57cec5SDimitry Andric template <bool cancellable> struct is_cancellable {};
17430b57cec5SDimitry Andric template <> struct is_cancellable<true> {
17440b57cec5SDimitry Andric bool value;
17450b57cec5SDimitry Andric is_cancellable() : value(false) {}
17460b57cec5SDimitry Andric is_cancellable(bool b) : value(b) {}
17470b57cec5SDimitry Andric is_cancellable &operator=(bool b) {
17480b57cec5SDimitry Andric value = b;
17490b57cec5SDimitry Andric return *this;
17500b57cec5SDimitry Andric }
17510b57cec5SDimitry Andric operator bool() const { return value; }
17520b57cec5SDimitry Andric };
17530b57cec5SDimitry Andric template <> struct is_cancellable<false> {
17540b57cec5SDimitry Andric is_cancellable &operator=(bool b) { return *this; }
17550b57cec5SDimitry Andric constexpr operator bool() const { return false; }
17560b57cec5SDimitry Andric };
17570b57cec5SDimitry Andric 
17580b57cec5SDimitry Andric // Internal function to do a barrier.
17590b57cec5SDimitry Andric /* If is_split is true, do a split barrier, otherwise, do a plain barrier
17600b57cec5SDimitry Andric If reduce is non-NULL, do a split reduction barrier, otherwise, do a
17610b57cec5SDimitry Andric barrier without a reduction
17620b57cec5SDimitry Andric When cancellable = false,
1763fe6060f1SDimitry Andric Returns 0 if primary thread, 1 if worker thread.
17640b57cec5SDimitry Andric When cancellable = true
17650b57cec5SDimitry Andric Returns 0 if not cancelled, 1 if cancelled.
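Editorial usage note (added for clarity; not part of the upstream
comment): the non-cancellable instantiation is the common case; with
cancellable = false, the is_cancellable<false> specialization above turns
every test of 'cancelled' into the compile-time constant false, so the
cancellation branches fold away at no runtime cost.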
*/ 17660b57cec5SDimitry Andric template <bool cancellable = false> 17670b57cec5SDimitry Andric static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, 17680b57cec5SDimitry Andric size_t reduce_size, void *reduce_data, 17690b57cec5SDimitry Andric void (*reduce)(void *, void *)) { 17700b57cec5SDimitry Andric KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); 17710b57cec5SDimitry Andric KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 17720b57cec5SDimitry Andric int tid = __kmp_tid_from_gtid(gtid); 17730b57cec5SDimitry Andric kmp_info_t *this_thr = __kmp_threads[gtid]; 17740b57cec5SDimitry Andric kmp_team_t *team = this_thr->th.th_team; 17750b57cec5SDimitry Andric int status = 0; 17760b57cec5SDimitry Andric is_cancellable<cancellable> cancelled; 17770b57cec5SDimitry Andric #if OMPT_SUPPORT && OMPT_OPTIONAL 17780b57cec5SDimitry Andric ompt_data_t *my_task_data; 17790b57cec5SDimitry Andric ompt_data_t *my_parallel_data; 17800b57cec5SDimitry Andric void *return_address; 17810b57cec5SDimitry Andric ompt_sync_region_t barrier_kind; 17820b57cec5SDimitry Andric #endif 17830b57cec5SDimitry Andric 17840b57cec5SDimitry Andric KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, 17850b57cec5SDimitry Andric __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 17860b57cec5SDimitry Andric 17870b57cec5SDimitry Andric #if OMPT_SUPPORT 17880b57cec5SDimitry Andric if (ompt_enabled.enabled) { 17890b57cec5SDimitry Andric #if OMPT_OPTIONAL 17900b57cec5SDimitry Andric my_task_data = OMPT_CUR_TASK_DATA(this_thr); 17910b57cec5SDimitry Andric my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 17920b57cec5SDimitry Andric return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 17930b57cec5SDimitry Andric barrier_kind = __ompt_get_barrier_kind(bt, this_thr); 17940b57cec5SDimitry Andric if (ompt_enabled.ompt_callback_sync_region) { 17950b57cec5SDimitry Andric ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 17960b57cec5SDimitry Andric barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 17970b57cec5SDimitry Andric return_address); 17980b57cec5SDimitry Andric } 17990b57cec5SDimitry Andric if (ompt_enabled.ompt_callback_sync_region_wait) { 18000b57cec5SDimitry Andric ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 18010b57cec5SDimitry Andric barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 18020b57cec5SDimitry Andric return_address); 18030b57cec5SDimitry Andric } 18040b57cec5SDimitry Andric #endif 18050b57cec5SDimitry Andric // It is OK to report the barrier state after the barrier begin callback. 18060b57cec5SDimitry Andric // According to the OMPT specification, a compliant implementation may 18070b57cec5SDimitry Andric // even delay reporting this state until the barrier begins to wait. 
1808*0fca6ea1SDimitry Andric auto *ompt_thr_info = &this_thr->th.ompt_thread_info; 1809*0fca6ea1SDimitry Andric switch (barrier_kind) { 1810*0fca6ea1SDimitry Andric case ompt_sync_region_barrier_explicit: 1811*0fca6ea1SDimitry Andric ompt_thr_info->state = ompt_state_wait_barrier_explicit; 1812*0fca6ea1SDimitry Andric break; 1813*0fca6ea1SDimitry Andric case ompt_sync_region_barrier_implicit_workshare: 1814*0fca6ea1SDimitry Andric ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare; 1815*0fca6ea1SDimitry Andric break; 1816*0fca6ea1SDimitry Andric case ompt_sync_region_barrier_implicit_parallel: 1817*0fca6ea1SDimitry Andric ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel; 1818*0fca6ea1SDimitry Andric break; 1819*0fca6ea1SDimitry Andric case ompt_sync_region_barrier_teams: 1820*0fca6ea1SDimitry Andric ompt_thr_info->state = ompt_state_wait_barrier_teams; 1821*0fca6ea1SDimitry Andric break; 1822*0fca6ea1SDimitry Andric case ompt_sync_region_barrier_implementation: 1823*0fca6ea1SDimitry Andric [[fallthrough]]; 1824*0fca6ea1SDimitry Andric default: 1825*0fca6ea1SDimitry Andric ompt_thr_info->state = ompt_state_wait_barrier_implementation; 1826*0fca6ea1SDimitry Andric } 18270b57cec5SDimitry Andric } 18280b57cec5SDimitry Andric #endif 18290b57cec5SDimitry Andric 18300b57cec5SDimitry Andric if (!team->t.t_serialized) { 18310b57cec5SDimitry Andric #if USE_ITT_BUILD 18320b57cec5SDimitry Andric // This value will be used in itt notify events below. 18330b57cec5SDimitry Andric void *itt_sync_obj = NULL; 18340b57cec5SDimitry Andric #if USE_ITT_NOTIFY 18350b57cec5SDimitry Andric if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 18360b57cec5SDimitry Andric itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 18370b57cec5SDimitry Andric #endif 18380b57cec5SDimitry Andric #endif /* USE_ITT_BUILD */ 18390b57cec5SDimitry Andric if (__kmp_tasking_mode == tskm_extra_barrier) { 18400b57cec5SDimitry Andric __kmp_tasking_barrier(team, this_thr, gtid); 18410b57cec5SDimitry Andric KA_TRACE(15, 18420b57cec5SDimitry Andric ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 18430b57cec5SDimitry Andric __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 18440b57cec5SDimitry Andric } 18450b57cec5SDimitry Andric 18460b57cec5SDimitry Andric /* Copy the blocktime info to the thread, where __kmp_wait_template() can 18470b57cec5SDimitry Andric access it when the team struct is not guaranteed to exist. */ 18480b57cec5SDimitry Andric // See note about the corresponding code in __kmp_join_barrier() being 18490b57cec5SDimitry Andric // performance-critical. 
18500b57cec5SDimitry Andric if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 18510b57cec5SDimitry Andric #if KMP_USE_MONITOR 18520b57cec5SDimitry Andric this_thr->th.th_team_bt_intervals = 18530b57cec5SDimitry Andric team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 18540b57cec5SDimitry Andric this_thr->th.th_team_bt_set = 18550b57cec5SDimitry Andric team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 18560b57cec5SDimitry Andric #else 18570b57cec5SDimitry Andric this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 18580b57cec5SDimitry Andric #endif 18590b57cec5SDimitry Andric } 18600b57cec5SDimitry Andric 18610b57cec5SDimitry Andric #if USE_ITT_BUILD 18620b57cec5SDimitry Andric if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 18630b57cec5SDimitry Andric __kmp_itt_barrier_starting(gtid, itt_sync_obj); 18640b57cec5SDimitry Andric #endif /* USE_ITT_BUILD */ 18650b57cec5SDimitry Andric #if USE_DEBUGGER 18660b57cec5SDimitry Andric // Let the debugger know: the thread arrived to the barrier and waiting. 1867fe6060f1SDimitry Andric if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct 18680b57cec5SDimitry Andric team->t.t_bar[bt].b_master_arrived += 1; 18690b57cec5SDimitry Andric } else { 18700b57cec5SDimitry Andric this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; 18710b57cec5SDimitry Andric } // if 18720b57cec5SDimitry Andric #endif /* USE_DEBUGGER */ 18730b57cec5SDimitry Andric if (reduce != NULL) { 18740b57cec5SDimitry Andric // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 18750b57cec5SDimitry Andric this_thr->th.th_local.reduce_data = reduce_data; 18760b57cec5SDimitry Andric } 18770b57cec5SDimitry Andric 18780b57cec5SDimitry Andric if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) 1879*0fca6ea1SDimitry Andric __kmp_task_team_setup(this_thr, team); 18800b57cec5SDimitry Andric 18810b57cec5SDimitry Andric if (cancellable) { 18820b57cec5SDimitry Andric cancelled = __kmp_linear_barrier_gather_cancellable( 18830b57cec5SDimitry Andric bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 18840b57cec5SDimitry Andric } else { 18850b57cec5SDimitry Andric switch (__kmp_barrier_gather_pattern[bt]) { 1886349cc55cSDimitry Andric case bp_dist_bar: { 1887349cc55cSDimitry Andric __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, 1888349cc55cSDimitry Andric reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1889349cc55cSDimitry Andric break; 1890349cc55cSDimitry Andric } 18910b57cec5SDimitry Andric case bp_hyper_bar: { 18920b57cec5SDimitry Andric // don't set branch bits to 0; use linear 18930b57cec5SDimitry Andric KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 18940b57cec5SDimitry Andric __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, 18950b57cec5SDimitry Andric reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 18960b57cec5SDimitry Andric break; 18970b57cec5SDimitry Andric } 18980b57cec5SDimitry Andric case bp_hierarchical_bar: { 18990b57cec5SDimitry Andric __kmp_hierarchical_barrier_gather( 19000b57cec5SDimitry Andric bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 19010b57cec5SDimitry Andric break; 19020b57cec5SDimitry Andric } 19030b57cec5SDimitry Andric case bp_tree_bar: { 19040b57cec5SDimitry Andric // don't set branch bits to 0; use linear 19050b57cec5SDimitry Andric KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 19060b57cec5SDimitry Andric __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, 19070b57cec5SDimitry Andric reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 19080b57cec5SDimitry Andric break; 19090b57cec5SDimitry 
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
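      /* A note on the ITT event sequence (an informal summary, not
         normative): "starting" is emitted when a thread begins waiting in
         the gather phase, "middle" when the barrier has been acquired (all
         threads arrived), and "finished" when the thread leaves the release
         phase. Tools such as VTune use these to attribute barrier wait
         time. */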
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
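    /* Gather phase is complete. status is 0 on the primary thread and 1 on
       workers. Workers always enter the release phase below and block until
       released; with a split barrier (is_split == TRUE) the primary skips the
       release phase here and instead frees the workers later, from
       __kmp_end_split_barrier(). The release pattern, like the gather pattern
       above, is selected per barrier type via
       __kmp_barrier_release_pattern[]. */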
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_dist_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the
       actual release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
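    /* Even a serialized team may owe work to the tasking subsystem: proxy
       tasks and hidden-helper tasks can still be outstanding (see the
       KMP_DEBUG_ASSERT below), so the task team has to be drained before
       this "barrier" can complete. */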
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(
            this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
            this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  if (cancellable)
    return (int)cancelled;
  return status;
}
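// A minimal usage sketch (hypothetical caller, for illustration only): a
// reduction barrier passes a combiner so partial results can be folded
// during the gather phase. my_combiner and local_sum are invented names.
//
//   static void my_combiner(void *lhs, void *rhs) {
//     *(int *)lhs += *(int *)rhs; // fold rhs's partial sum into lhs
//   }
//   // ... executed by every thread in the team:
//   int local_sum = /* this thread's partial result */ 0;
//   int rc = __kmp_barrier(bs_reduction_barrier, gtid, TRUE, sizeof(int),
//                          &local_sum, my_combiner);
//   // rc == 0 on the primary thread, 1 on workers; since is_split is TRUE,
//   // the primary later calls __kmp_end_split_barrier() to free the workers.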
// Returns 0 if primary thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}

#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Primary thread does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif

void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      } // if
    }
  }
}

void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  int nproc = this_thr->th.th_team_nproc;
#endif
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
  kmp_info_t *master_thread = this_thr->th.th_team_master;
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));
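  /* OMPT bookkeeping (an informal summary): the join barrier is reported to
     tools as the implicit-parallel sync region (or, for a league of teams,
     the teams sync region). The scope_begin callbacks fire here; the
     matching scope_end callbacks fire in __kmp_fork_barrier() once the
     thread is released into the next region or shut down. */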
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
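  /* For example (assuming the documented KMP_BLOCKTIME setting): running
     with KMP_BLOCKTIME=infinite makes __kmp_dflt_blocktime equal to
     KMP_MAX_BLOCKTIME, so the copy below is skipped entirely and spinning
     threads never time out into sleep. */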
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any
     time by the primary thread - it is unsafe to reference it in any of the
     worker threads. Any per-team data items that need to be referenced before
     the end of the barrier should be moved to the kmp_task_team_t structs. */
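  /* Note that the join barrier performs only the gather phase: workers are
     not released here. They fall through into their fork/join barrier wait
     and are released in __kmp_fork_barrier() when the primary starts the
     next parallel region (or are reaped at shutdown). */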
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have the primary thread flag the workers to indicate they are now
    // waiting for the next parallel region; also wake them up so they switch
    // their timers to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
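          // delta now holds the summed wait time of the whole team (each
          // term is how long one thread sat in the barrier before cur_time);
          // it is reported to ITT as the imbalance metric for this frame.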
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}
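/* The counterpart of __kmp_join_barrier(): worker threads sleep here between
   parallel regions, and the primary thread releases them once the next
   region's team structure is ready. The gather half of this barrier is the
   join barrier above; this function implements the release half plus the
   per-thread setup (ICVs, task teams, affinity) for the new region. */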
// TODO release worker threads' fork barriers as we are ready instead of all
// at once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
#ifdef KMP_DEBUG
  if (team)
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                  (team != NULL) ? team->t.t_id : -1, tid));
#endif
  // th_team pointer only valid for primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled &&
      (ompt_state == ompt_state_wait_barrier_teams ||
       ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
      sync_kind = ompt_sync_region_barrier_teams;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that
     the team pointer is non-null. */
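  /* TCR_PTR is the runtime's thread-consistent-read macro (a volatile load),
     so each worker observes the th_team pointer most recently published by
     the primary thread rather than a stale cached value. */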
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary
     thread's thread struct, because it is not always the case that the
     threads arrays have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_bind_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
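  /* Note: each thread applies its own place binding above rather than having
     the primary set it, since at this point every worker is running on its
     own OS thread; balanced affinity is recomputed only when the team size
     changed (see t_size_changed above). */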
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy the ICVs into th_fixed_icvs in the primary thread's thread structure
     (which remains untouched), where all of the worker threads can access
     them and make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
  // time.
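  /* In this linear fallback the primary writes every worker's ICV copy
     directly into its implicit task, doing O(nthreads) work before the
     workers are released; the PULL and PUSH variants above instead move that
     propagation into the fork barrier itself. */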
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}