/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the master thread at
       any time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about the sleep bit here or atomics, since this is a
    // team-level setting only the master updates
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
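    // The master serially sweeps workers 1..nproc-1, waiting for each worker's
    // b_arrived flag to reach new_state and folding that worker's reduce_data
    // (when a reduce callback is supplied) into its own, so both the gather
    // and the reduction cost O(nproc) on the master.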
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                       new_state);
      if (cancellable) {
        bool cancelled = flag.wait_cancellable_nosleep(
            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (cancelled)
          return true;
      } else {
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about the sleep bit here or atomics, since this is a
    // team-level setting only the master updates
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

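      // Flat release: each flag.release() below bumps the worker's b_go by
      // KMP_BARRIER_STATE_BUMP and, if the worker went to sleep while waiting,
      // wakes it up (see kmp_flag_64 in kmp_wait_release.h).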
      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                         other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    if (cancellable) {
      bool cancelled = flag.wait_cancellable_nosleep(
          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      if (cancelled) {
        return true;
      }
    } else {
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                          int tid, void (*reduce)(void *, void *)
                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
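  // With branch factor 2^branch_bits, thread tid's children are
  // tid * branch_factor + 1 .. tid * branch_factor + branch_factor (capped at
  // nproc) and its parent is (tid - 1) >> branch_bits, so tid 0 is the root.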
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the
       team is valid any more - it could be deallocated by the master thread at
       any time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the master thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
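  // From here on both the master and every already-released worker act as
  // interior nodes: each releases its own children (if any), so the wake-up
  // propagates down the tree in parallel instead of all from the master.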
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64 flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Hyper Barrier
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                           int tid, void (*reduce)(void *, void *)
                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64 p_flag(&thr_bar->b_arrived);
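  /* Example with branch_bits = 2 (branch_factor = 4) and 16 threads:
       level 0: tids with (tid & 3) != 0 signal parent (tid & ~3) and stop;
                tids 0, 4, 8, 12 wait for children tid+1 .. tid+3.
       level 2: tids 4, 8, 12 signal parent (tid & ~15) == 0 and stop;
                tid 0 waits for children 4, 8 and 12.
     Only the master (tid 0) completes every level of the loop below. */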
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing
         for loop), a worker thread may not assume that the team is valid any
         more - it could be deallocated by the master thread at any time. */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // master
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // master already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

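  /* Release traversal: with KMP_REVERSE_HYPER_BAR each thread first finds the
     topmost level at which it acts as a parent, then walks back down,
     releasing its children at each level from the farthest child to the
     nearest - the reverse of the order in which they checked in. */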
#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        ANNOTATE_BARRIER_BEGIN(child_thr);
        kmp_flag_64 flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their
   local b_go. */
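/* The thread hierarchy (set up by __kmp_get_hierarchy) is meant to mirror the
   machine topology. A leaf child checks in by storing into one byte of its
   parent's 64-bit flag word (byte index thr_bar->offset), so the byte layout
   allows at most 8 leaf children per parent and lets them check in with plain
   byte stores instead of separate flag objects. */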
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for master
    thr_bar->parent_tid = -1; // default for master
    if (!KMP_MASTER_TID(
            tid)) { // if not master, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the master
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
                   0) { // TODO: can we make this op faster?
          // thread is not a subtree root at the next level, so this is its
          // topmost level
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      thr_bar->leaf_kids = nproc - tid - 1;
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}

static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          ANNOTATE_REDUCE_AFTER(reduce);
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_BARRIER_END(other_threads[child_tid]);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          ANNOTATE_REDUCE_BEFORE(reduce);
          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not master thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the master thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      ANNOTATE_BARRIER_BEGIN(this_thr);
      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Master thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // master already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
        // access
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

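  // Release strategy below: with infinite blocktime and the on-core barrier,
  // the master does a flat NGO-store release of every non-leaf thread
  // (piggybacking the fixed ICVs on the same cache line as b_go), while leaf
  // children are released in bulk by setting their leaf_state bytes in this
  // thread's own b_go word; otherwise a conventional hierarchical release is
  // done through each child's b_go flag.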
  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64 flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

// End of Barrier Algorithms

// type traits for cancellable value
// if cancellable is true, then is_cancellable is a normal boolean variable
// if cancellable is false, then is_cancellable is a compile-time constant
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a regular
   barrier.
   When cancellable = false,
     Returns 0 if master thread, 1 if worker thread.
   When cancellable = true
     Returns 0 if not cancelled, 1 if cancelled. */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
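    // Non-serialized team: run the real gather/release using the pattern
    // selected per barrier type in __kmp_barrier_gather_pattern /
    // __kmp_barrier_release_pattern; the cancellable path below always uses
    // the linear algorithm.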
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread has arrived at the barrier and is
    // waiting.
    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // use 0 to only set up the current team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);

    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, master thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          this_thr->th.th_teams_microtask == NULL &&
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
                // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with master's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
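    // Workers always go through the release phase; the master skips it for a
    // split barrier (the release half is completed later, in
    // __kmp_end_split_barrier()).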
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the
       actual release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif
  ANNOTATE_BARRIER_END(&team->t.t_bar);

  if (cancellable)
    return (int)cancelled;
  return status;
}

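// __kmp_barrier_template is instantiated here in two forms: the default
// (non-cancellable) __kmp_barrier() entry point below, and the cancellable
// variant used by the GOMP compatibility wrapper.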
// Returns 0 if master thread, 1 if worker thread.
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}

#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Master does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif

void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      } // if
    }
  }
  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

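// The join barrier is the implicit barrier at the end of a parallel region:
// all team threads gather here, after which only the master returns to the
// fork/join machinery while workers typically proceed to the fork barrier to
// wait for the next parallel region.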
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

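  // The join barrier never carries a reduction, so the gather calls above
  // pass NULL for the reduce callback.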
  /* From this point on, the team data structure may be deallocated at any time
     by the master thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have the master thread flag the workers to indicate they are now waiting
    // for the next parallel region. Also wake them up so they switch their
    // timers to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with master's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

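// The fork barrier is where worker threads wait between parallel regions: the
// master releases them here once the new team is set up. Depending on
// configuration, ICVs reach the workers either during the release itself
// (KMP_BARRIER_ICV_PUSH) or via the pull code further below
// (KMP_BARRIER_ICV_PULL).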
// TODO release worker threads' fork barriers as we are ready instead of all at
// once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for master thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates setup current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The master thread may have changed its blocktime between the join
       barrier and the fork barrier. Copy the blocktime info to the thread,
       where __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // master

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

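  // If OMPT is enabled, close out the implicit-barrier wait state that was
  // entered at the join barrier; workers that are done with their implicit
  // task also report its end here.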
#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     master and propagated to all worker threads. The current thread, however,
     may not be part of the team, so we can't blindly assume that the team
     pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
     struct, because it is not always the case that the threads arrays have
     been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
      // Copy the initial ICVs from the master's thread struct to the implicit
      // task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

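  // Re-apply affinity if needed: with proc_bind_intel, only balanced affinity
  // is re-evaluated, and only when the team size has changed; with an explicit
  // OpenMP binding (other than false), the thread moves to its assigned place
  // if it is not already there.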
#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

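// __kmp_setup_icv_copy primes ICV propagation for a (re)formed team using one
// of three strategies: PULL (stash the ICVs in the master's th_fixed_icvs so
// workers copy them after the fork barrier), PUSH (defer propagation to the
// fork-barrier release), or a direct linear copy into each worker's implicit
// task.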
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}