/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp_wait_release.h"
#include "kmp_barrier.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"
// for distributed barrier
#include "kmp_affinity.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
// Distributed barrier

// Compute how many threads to have polling each cache-line.
// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
void distributedBarrier::computeVarsForN(size_t n) {
  int nsockets = 1;
  if (__kmp_topology) {
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);

    if (nsockets <= 0)
      nsockets = 1;
    if (ncores_per_socket <= 0)
      ncores_per_socket = 1;

    threads_per_go = ncores_per_socket >> 1;
    if (!fix_threads_per_go) {
      // Minimize num_gos
      if (threads_per_go > 4) {
        if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
          threads_per_go = threads_per_go >> 1;
        }
        if (threads_per_go > 4 && nsockets == 1)
          threads_per_go = threads_per_go >> 1;
      }
    }
    if (threads_per_go == 0)
      threads_per_go = 1;
    fix_threads_per_go = true;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (nsockets == 1 || num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / nsockets;
      if (num_gos % nsockets)
        num_groups++;
    }
    if (num_groups <= 0)
      num_groups = 1;
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  } else {
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
    if (num_gos == 1)
      num_groups = 1;
    else {
      num_groups = num_gos / 2;
      if (num_gos % 2)
        num_groups++;
    }
    gos_per_group = num_gos / num_groups;
    if (num_gos % num_groups)
      gos_per_group++;
    threads_per_group = threads_per_go * gos_per_group;
  }
}

void distributedBarrier::computeGo(size_t n) {
  // Minimize num_gos
  for (num_gos = 1;; num_gos++)
    if (IDEAL_CONTENTION * num_gos >= n)
      break;
  threads_per_go = n / num_gos;
  if (n % num_gos)
    threads_per_go++;
  while (num_gos > MAX_GOS) {
    threads_per_go++;
    num_gos = n / threads_per_go;
    if (n % threads_per_go)
      num_gos++;
  }
  computeVarsForN(n);
}

// This function resizes the barrier arrays when the new number of threads
// exceeds max_threads, which is the current size of all the arrays
void distributedBarrier::resize(size_t nthr) {
  KMP_DEBUG_ASSERT(nthr > max_threads);

  // expand to requested size * 2
  max_threads = nthr * 2;

  // allocate arrays to new max threads
  for (int i = 0; i < MAX_ITERS; ++i) {
    if (flags[i])
      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
                                                 max_threads * sizeof(flags_s));
    else
      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  }

  if (go)
    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  else
    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));

  if (iter)
    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  else
    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));

  if (sleep)
    sleep =
        (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  else
    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
}

// This function sets all the go flags that threads might be waiting
// on; when blocktime is not infinite, it should be followed by a wake-up
// call to each thread
kmp_uint64 distributedBarrier::go_release() {
  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  for (size_t j = 0; j < num_gos; j++) {
    go[j].go.store(next_go);
  }
  return next_go;
}

void distributedBarrier::go_reset() {
  for (size_t j = 0; j < max_threads; ++j) {
    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
      flags[i][j].stillNeed = 1;
    }
    go[j].go.store(0);
    iter[j].iter = 0;
  }
}

// This function initializes/re-initializes the distributed barrier for a
// particular number of threads. If a resize of arrays is needed, it calls
// the resize function.
void distributedBarrier::init(size_t nthr) {
  size_t old_max = max_threads;
  if (nthr > max_threads) { // need more space in arrays
    resize(nthr);
  }

  for (size_t i = 0; i < max_threads; i++) {
    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
      flags[j][i].stillNeed = 1;
    }
    go[i].go.store(0);
    iter[i].iter = 0;
    if (i >= old_max)
      sleep[i].sleep = false;
  }

  // Recalculate num_gos, etc. based on new nthr
  computeVarsForN(nthr);

  num_threads = nthr;

  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
}

// This function is used only when KMP_BLOCKTIME is not infinite.
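// The release paths below call it in two shapes: a group leader wakes its own
// group with (start = tid + 1, stop = group_end, inc = 1), while the primary
// thread wakes the group leaders with (start = tid + threads_per_group,
// stop = nproc, inc = threads_per_group).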
// static
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
                               size_t start, size_t stop, size_t inc,
                               size_t tid) {
  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    return;

  kmp_info_t **other_threads = team->t.t_threads;
  for (size_t thr = start; thr < stop; thr += inc) {
    KMP_DEBUG_ASSERT(other_threads[thr]);
    int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
    // Wake up worker regardless of whether it appears to be sleeping or not
    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
  }
}

static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_info_t **other_threads;
  kmp_uint64 my_current_iter, my_next_iter;
  kmp_uint32 nproc;
  bool group_leader;

  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;
  b = team->t.b;
  my_current_iter = b->iter[tid].iter;
  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
  group_leader = ((tid % b->threads_per_group) == 0);

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
            gtid, team->t.t_id, tid, bt));

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif

  if (group_leader) {
    // Start from the thread after the group leader
    size_t group_start = tid + 1;
    size_t group_end = tid + b->threads_per_group;
    size_t threads_pending = 0;

    if (group_end > nproc)
      group_end = nproc;
    do { // wait for threads in my group
      threads_pending = 0;
      // Check all the flags every time to avoid branch mispredict
      for (size_t thr = group_start; thr < group_end; thr++) {
        // Each thread uses a different cache line
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      OMPT_REDUCTION_DECL(this_thr, gtid);
      OMPT_REDUCTION_BEGIN;
      // Group leader reduces all threads in group
      for (size_t thr = group_start; thr < group_end; thr++) {
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[thr]->th.th_local.reduce_data);
      }
      OMPT_REDUCTION_END;
    }

    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;

    do { // wait for all group leaders
      threads_pending = 0;
      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
        threads_pending += b->flags[my_current_iter][thr].stillNeed;
      }
      // Execute tasks here
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        kmp_task_team_t *task_team = this_thr->th.th_task_team;
        if (task_team != NULL) {
          if (TCR_SYNC_4(task_team->tt.tt_active)) {
            if (KMP_TASKING_ENABLED(task_team)) {
              int tasks_completed = FALSE;
              __kmp_atomic_execute_tasks_64(
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
            } else
              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
          }
        } else {
          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
        } // if
      }
      if (TCR_4(__kmp_global.g.g_done)) {
        if (__kmp_global.g.g_abort)
          __kmp_abort_thread();
        break;
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);

    if (reduce) { // Perform reduction if needed
      if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        for (size_t thr = b->threads_per_group; thr < nproc;
             thr += b->threads_per_group) {
          (*reduce)(this_thr->th.th_local.reduce_data,
                    other_threads[thr]->th.th_local.reduce_data);
        }
        OMPT_REDUCTION_END;
      }
    }
  } else {
    // Set flag for next iteration
    b->flags[my_next_iter][tid].stillNeed = 1;
    // Each thread uses a different cache line; resets stillNeed to 0 to
    // indicate it has reached the barrier
    b->flags[my_current_iter][tid].stillNeed = 0;
  }

  KMP_MFENCE();

  KA_TRACE(20,
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_bstate_t *thr_bar;
  kmp_uint64 my_current_iter, next_go;
  size_t my_go_index;
  bool group_leader;

  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));

  thr_bar = &this_thr->th.th_bar[bt].bb;

  if (!KMP_MASTER_TID(tid)) {
    // workers and non-master group leaders need to check their presence in team
    do {
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) {
        // Thread is not in use in a team. Wait on location in tid's thread
        // struct. The 0 value tells anyone looking that this thread is spinning
        // or sleeping until this location becomes 3 again; 3 is the transition
        // state to get to 1 which is waiting on go and being in the team
        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
                                        0) ||
            this_thr->th.th_used_in_team.load() == 0) {
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
          // In fork barrier where we could not get the object reliably
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);

          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
      }
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
        continue;
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      // At this point, the thread thinks it is in use in a team, or in
      // transition to be used in a team, but it might have reached this barrier
      // before it was marked unused by the team. Unused threads are awoken and
      // shifted to wait on local thread struct elsewhere. It also might reach
      // this point by being picked up for use by a different team. Either way,
      // we need to update the tid.
      tid = __kmp_tid_from_gtid(gtid);
      team = this_thr->th.th_team;
      KMP_DEBUG_ASSERT(tid >= 0);
      KMP_DEBUG_ASSERT(team);
      b = team->t.b;
      my_current_iter = b->iter[tid].iter;
      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
      my_go_index = tid / b->threads_per_go;
      if (this_thr->th.th_used_in_team.load() == 3) {
        (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
                                          1);
      }
      // Check if go flag is set
      if (b->go[my_go_index].go.load() != next_go) {
        // Wait on go flag on team
        kmp_atomic_flag_64<false, true> my_flag(
            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                         b->iter[tid].iter == 0);
        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
      }

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;
      // At this point, the thread's go location was set. This means the primary
      // thread is safely in the barrier, and so this thread's data is
      // up-to-date, but we should check again that this thread is really in
      // use in the team, as it could have been woken up for the purpose of
      // changing team size, or reaping threads at shutdown.
      if (this_thr->th.th_used_in_team.load() == 1)
        break;
    } while (1);

    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    group_leader = ((tid % b->threads_per_group) == 0);
    if (group_leader) {
      // Tell all the threads in my group they can go!
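      // (For example, with gos_per_group == 4 and my_go_index == 8, this
      // stores next_go into go[9..11]; go[8] was already set by the primary
      // thread when it released the group leaders.)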
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
        b->go[go_idx].go.store(next_go);
      }
      // Fence added so that workers can see changes to go. sfence inadequate.
      KMP_MFENCE();
    }

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // copy ICVs to final dest
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
      // This thread is now awake and participating in the barrier;
      // wake up the other threads in the group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  } else { // Primary thread
    team = this_thr->th.th_team;
    b = team->t.b;
    my_current_iter = b->iter[tid].iter;
    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
      // primary thread has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    // Tell all the group leaders they can go!
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
      b->go[go_idx].go.store(next_go);
    }

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the group leaders
      size_t nproc = this_thr->th.th_team_nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
    }

    // Tell all the threads in my group they can go!
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
      b->go[go_idx].go.store(next_go);
    }

    // Fence added so that workers can see changes to go. sfence inadequate.
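    // (KMP_MFENCE is a full memory fence; presumably an sfence would not be
    // enough because the go[] stores must also be ordered ahead of the loads
    // the wake-up path below performs, e.g. of per-thread sleep state.)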
    KMP_MFENCE();

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the other threads in my group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  }
  // Update to next iteration
  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;

  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
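    // (The primary waits for each worker's b_arrived in turn, so this gather
    // is linear in the team size; the tree, hypercube, and distributed
    // variants below exist to avoid this serial bottleneck on larger teams.)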
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;
    // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid,
                                              bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Hyper Barrier
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
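  // (Illustration: with branch_bits == 1, i.e. branch_factor == 2, and eight
  // threads, level 0 has tids 1,3,5,7 signal 0,2,4,6; level 1 has 2 and 6
  // signal 0 and 4; level 2 has 4 signal 0 -- a hypercube-shaped reduction.)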
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the primary thread at any time. */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
        // Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}

static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
1522 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " 1523 "barrier type %d\n", 1524 gtid, team->t.t_id, tid, bt)); 1525 } 1526 1527 static void __kmp_hierarchical_barrier_release( 1528 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, 1529 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { 1530 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); 1531 kmp_team_t *team; 1532 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; 1533 kmp_uint32 nproc; 1534 bool team_change = false; // indicates on-core barrier shouldn't be used 1535 1536 if (KMP_MASTER_TID(tid)) { 1537 team = __kmp_threads[gtid]->th.th_team; 1538 KMP_DEBUG_ASSERT(team != NULL); 1539 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " 1540 "entered barrier type %d\n", 1541 gtid, team->t.t_id, tid, bt)); 1542 } else { // Worker threads 1543 // Wait for parent thread to release me 1544 if (!thr_bar->use_oncore_barrier || 1545 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || 1546 thr_bar->team == NULL) { 1547 // Use traditional method of waiting on my own b_go flag 1548 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; 1549 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); 1550 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 1551 TCW_8(thr_bar->b_go, 1552 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1553 } else { // Thread barrier data is initialized, this is a leaf, blocktime is 1554 // infinite, not nested 1555 // Wait on my "offset" bits on parent's b_go flag 1556 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; 1557 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, 1558 thr_bar->offset + 1, bt, 1559 this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); 1560 flag.wait(this_thr, TRUE); 1561 if (thr_bar->wait_flag == 1562 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go 1563 TCW_8(thr_bar->b_go, 1564 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1565 } else { // Reset my bits on parent's b_go flag 1566 (RCAST(volatile char *, 1567 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; 1568 } 1569 } 1570 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; 1571 // Early exit for reaping threads releasing forkjoin barrier 1572 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) 1573 return; 1574 // The worker thread may now assume that the team is valid. 1575 team = __kmp_threads[gtid]->th.th_team; 1576 KMP_DEBUG_ASSERT(team != NULL); 1577 tid = __kmp_tid_from_gtid(gtid); 1578 1579 KA_TRACE( 1580 20, 1581 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", 1582 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); 1583 KMP_MB(); // Flush all pending memory write invalidates. 1584 } 1585 1586 nproc = this_thr->th.th_team_nproc; 1587 int level = team->t.t_level; 1588 if (team->t.t_threads[0] 1589 ->th.th_teams_microtask) { // are we inside the teams construct? 1590 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 1591 this_thr->th.th_teams_level == level) 1592 ++level; // level was not increased in teams construct for team_of_workers 1593 if (this_thr->th.th_teams_size.nteams > 1) 1594 ++level; // level was not increased in teams construct for team_of_masters 1595 } 1596 if (level == 1) 1597 thr_bar->use_oncore_barrier = 1; 1598 else 1599 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested 1600 1601 // If the team size has increased, we still communicate with old leaves via 1602 // oncore barrier. 
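// A rough sketch (illustrative only, not part of the runtime) of the
// byte-per-leaf scheme that makes this possible, assuming each leaf child owns
// one byte inside the parent's 64-bit flag word:
//
//   b_go |= leaf_state;                        // one store releases all leaves
//   ((volatile char *)&b_go)[offset + 1] = 0;  // a leaf clears only its byte
//
// The bookkeeping below tracks which leaves were already covered by the old
// leaf_state when the team size changes.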
1603 unsigned short int old_leaf_kids = thr_bar->leaf_kids; 1604 kmp_uint64 old_leaf_state = thr_bar->leaf_state; 1605 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, 1606 tid, team); 1607 // But if the entire team changes, we won't use oncore barrier at all 1608 if (team_change) 1609 old_leaf_kids = 0; 1610 1611 #if KMP_BARRIER_ICV_PUSH 1612 if (propagate_icvs) { 1613 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, 1614 FALSE); 1615 if (KMP_MASTER_TID( 1616 tid)) { // primary already has copy in final destination; copy 1617 copy_icvs(&thr_bar->th_fixed_icvs, 1618 &team->t.t_implicit_task_taskdata[tid].td_icvs); 1619 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1620 thr_bar->use_oncore_barrier) { // optimization for inf blocktime 1621 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) 1622 // leaves (on-core children) pull parent's fixed ICVs directly to local 1623 // ICV store 1624 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1625 &thr_bar->parent_bar->th_fixed_icvs); 1626 // non-leaves will get ICVs piggybacked with b_go via NGO store 1627 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs 1628 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can 1629 // access 1630 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); 1631 else // leaves copy parent's fixed ICVs directly to local ICV store 1632 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1633 &thr_bar->parent_bar->th_fixed_icvs); 1634 } 1635 } 1636 #endif // KMP_BARRIER_ICV_PUSH 1637 1638 // Now, release my children 1639 if (thr_bar->my_level) { // not a leaf 1640 kmp_int32 child_tid; 1641 kmp_uint32 last; 1642 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && 1643 thr_bar->use_oncore_barrier) { 1644 if (KMP_MASTER_TID(tid)) { // do a flat release 1645 // Set local b_go to bump children via NGO store of the cache line 1646 // containing ICVs and b_go. 1647 thr_bar->b_go = KMP_BARRIER_STATE_BUMP; 1648 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of 1649 // the cache line 1650 ngo_load(&thr_bar->th_fixed_icvs); 1651 // This loops over all the threads skipping only the leaf nodes in the 1652 // hierarchy 1653 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; 1654 child_tid += thr_bar->skip_per_level[1]) { 1655 kmp_bstate_t *child_bar = 1656 &team->t.t_threads[child_tid]->th.th_bar[bt].bb; 1657 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1658 "releasing T#%d(%d:%d)" 1659 " go(%p): %u => %u\n", 1660 gtid, team->t.t_id, tid, 1661 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1662 child_tid, &child_bar->b_go, child_bar->b_go, 1663 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1664 // Use ngo store (if available) to both store ICVs and release child 1665 // via child's b_go 1666 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); 1667 } 1668 ngo_sync(); 1669 } 1670 TCW_8(thr_bar->b_go, 1671 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time 1672 // Now, release leaf children 1673 if (thr_bar->leaf_kids) { // if there are any 1674 // We test team_change on the off-chance that the level 1 team changed.
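// Note: if team_change was detected above, old_leaf_kids was reset to 0, so
// every leaf is treated as new below and released individually through its
// own b_go flag rather than via the parent's leaf_state bits.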
1675 if (team_change || 1676 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new 1677 if (old_leaf_kids) { // release old leaf kids 1678 thr_bar->b_go |= old_leaf_state; 1679 } 1680 // Release new leaf kids 1681 last = tid + thr_bar->skip_per_level[1]; 1682 if (last > nproc) 1683 last = nproc; 1684 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; 1685 ++child_tid) { // skip_per_level[0]=1 1686 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1687 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1688 KA_TRACE( 1689 20, 1690 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" 1691 " T#%d(%d:%d) go(%p): %u => %u\n", 1692 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), 1693 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, 1694 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1695 // Release child using child's b_go flag 1696 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1697 flag.release(); 1698 } 1699 } else { // Release all children at once with leaf_state bits on my own 1700 // b_go flag 1701 thr_bar->b_go |= thr_bar->leaf_state; 1702 } 1703 } 1704 } else { // Blocktime is not infinite; do a simple hierarchical release 1705 for (int d = thr_bar->my_level - 1; d >= 0; 1706 --d) { // Release highest level threads first 1707 last = tid + thr_bar->skip_per_level[d + 1]; 1708 kmp_uint32 skip = thr_bar->skip_per_level[d]; 1709 if (last > nproc) 1710 last = nproc; 1711 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { 1712 kmp_info_t *child_thr = team->t.t_threads[child_tid]; 1713 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; 1714 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " 1715 "releasing T#%d(%d:%d) go(%p): %u => %u\n", 1716 gtid, team->t.t_id, tid, 1717 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, 1718 child_tid, &child_bar->b_go, child_bar->b_go, 1719 child_bar->b_go + KMP_BARRIER_STATE_BUMP)); 1720 // Release child using child's b_go flag 1721 kmp_flag_64<> flag(&child_bar->b_go, child_thr); 1722 flag.release(); 1723 } 1724 } 1725 } 1726 #if KMP_BARRIER_ICV_PUSH 1727 if (propagate_icvs && !KMP_MASTER_TID(tid)) 1728 // non-leaves copy ICVs from fixed ICVs to local dest 1729 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 1730 &thr_bar->th_fixed_icvs); 1731 #endif // KMP_BARRIER_ICV_PUSH 1732 } 1733 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " 1734 "barrier type %d\n", 1735 gtid, team->t.t_id, tid, bt)); 1736 } 1737 1738 // End of Barrier Algorithms 1739 1740 // type traits for cancellable value 1741 // if cancellable is true, then is_cancellable is a normal boolean variable 1742 // if cancellable is false, then is_cancellable is a compile time constant 1743 template <bool cancellable> struct is_cancellable {}; 1744 template <> struct is_cancellable<true> { 1745 bool value; 1746 is_cancellable() : value(false) {} 1747 is_cancellable(bool b) : value(b) {} 1748 is_cancellable &operator=(bool b) { 1749 value = b; 1750 return *this; 1751 } 1752 operator bool() const { return value; } 1753 }; 1754 template <> struct is_cancellable<false> { 1755 is_cancellable &operator=(bool b) { return *this; } 1756 constexpr operator bool() const { return false; } 1757 }; 1758 1759 // Internal function to do a barrier. 
1760 /* If is_split is true, do a split barrier, otherwise, do a plain barrier 1761 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split 1762 barrier 1763 When cancellable = false, 1764 Returns 0 if primary thread, 1 if worker thread. 1765 When cancellable = true 1766 Returns 0 if not cancelled, 1 if cancelled. */ 1767 template <bool cancellable = false> 1768 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, 1769 size_t reduce_size, void *reduce_data, 1770 void (*reduce)(void *, void *)) { 1771 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); 1772 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 1773 int tid = __kmp_tid_from_gtid(gtid); 1774 kmp_info_t *this_thr = __kmp_threads[gtid]; 1775 kmp_team_t *team = this_thr->th.th_team; 1776 int status = 0; 1777 is_cancellable<cancellable> cancelled; 1778 #if OMPT_SUPPORT && OMPT_OPTIONAL 1779 ompt_data_t *my_task_data; 1780 ompt_data_t *my_parallel_data; 1781 void *return_address; 1782 ompt_sync_region_t barrier_kind; 1783 #endif 1784 1785 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, 1786 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1787 1788 #if OMPT_SUPPORT 1789 if (ompt_enabled.enabled) { 1790 #if OMPT_OPTIONAL 1791 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 1792 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 1793 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1794 barrier_kind = __ompt_get_barrier_kind(bt, this_thr); 1795 if (ompt_enabled.ompt_callback_sync_region) { 1796 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1797 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1798 return_address); 1799 } 1800 if (ompt_enabled.ompt_callback_sync_region_wait) { 1801 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1802 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, 1803 return_address); 1804 } 1805 #endif 1806 // It is OK to report the barrier state after the barrier begin callback. 1807 // According to the OMPT specification, a compliant implementation may 1808 // even delay reporting this state until the barrier begins to wait. 1809 auto *ompt_thr_info = &this_thr->th.ompt_thread_info; 1810 switch (barrier_kind) { 1811 case ompt_sync_region_barrier_explicit: 1812 ompt_thr_info->state = ompt_state_wait_barrier_explicit; 1813 break; 1814 case ompt_sync_region_barrier_implicit_workshare: 1815 ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare; 1816 break; 1817 case ompt_sync_region_barrier_implicit_parallel: 1818 ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel; 1819 break; 1820 case ompt_sync_region_barrier_teams: 1821 ompt_thr_info->state = ompt_state_wait_barrier_teams; 1822 break; 1823 case ompt_sync_region_barrier_implementation: 1824 [[fallthrough]]; 1825 default: 1826 ompt_thr_info->state = ompt_state_wait_barrier_implementation; 1827 } 1828 } 1829 #endif 1830 1831 if (!team->t.t_serialized) { 1832 #if USE_ITT_BUILD 1833 // This value will be used in itt notify events below. 
1834 void *itt_sync_obj = NULL; 1835 #if USE_ITT_NOTIFY 1836 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1837 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 1838 #endif 1839 #endif /* USE_ITT_BUILD */ 1840 if (__kmp_tasking_mode == tskm_extra_barrier) { 1841 __kmp_tasking_barrier(team, this_thr, gtid); 1842 KA_TRACE(15, 1843 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, 1844 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); 1845 } 1846 1847 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 1848 access it when the team struct is not guaranteed to exist. */ 1849 // See note about the corresponding code in __kmp_join_barrier() being 1850 // performance-critical. 1851 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 1852 #if KMP_USE_MONITOR 1853 this_thr->th.th_team_bt_intervals = 1854 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 1855 this_thr->th.th_team_bt_set = 1856 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 1857 #else 1858 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 1859 #endif 1860 } 1861 1862 #if USE_ITT_BUILD 1863 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1864 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 1865 #endif /* USE_ITT_BUILD */ 1866 #if USE_DEBUGGER 1867 // Let the debugger know: the thread has arrived at the barrier and is waiting. 1868 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct 1869 team->t.t_bar[bt].b_master_arrived += 1; 1870 } else { 1871 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; 1872 } // if 1873 #endif /* USE_DEBUGGER */ 1874 if (reduce != NULL) { 1875 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 1876 this_thr->th.th_local.reduce_data = reduce_data; 1877 } 1878 1879 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) 1880 __kmp_task_team_setup(this_thr, team); 1881 1882 if (cancellable) { 1883 cancelled = __kmp_linear_barrier_gather_cancellable( 1884 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1885 } else { 1886 switch (__kmp_barrier_gather_pattern[bt]) { 1887 case bp_dist_bar: { 1888 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, 1889 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1890 break; 1891 } 1892 case bp_hyper_bar: { 1893 // don't set branch bits to 0; use linear 1894 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1895 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, 1896 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1897 break; 1898 } 1899 case bp_hierarchical_bar: { 1900 __kmp_hierarchical_barrier_gather( 1901 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1902 break; 1903 } 1904 case bp_tree_bar: { 1905 // don't set branch bits to 0; use linear 1906 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); 1907 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, 1908 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1909 break; 1910 } 1911 default: { 1912 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, 1913 reduce USE_ITT_BUILD_ARG(itt_sync_obj)); 1914 } 1915 } 1916 } 1917 1918 KMP_MB(); 1919 1920 if (KMP_MASTER_TID(tid)) { 1921 status = 0; 1922 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 1923 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 1924 } 1925 #if USE_DEBUGGER 1926 // Let the debugger know: All threads have arrived and are starting to leave the barrier.
1928 team->t.t_bar[bt].b_team_arrived += 1; 1929 #endif 1930 1931 if (__kmp_omp_cancellation) { 1932 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); 1933 // Reset cancellation flag for worksharing constructs 1934 if (cancel_request == cancel_loop || 1935 cancel_request == cancel_sections) { 1936 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); 1937 } 1938 } 1939 #if USE_ITT_BUILD 1940 /* TODO: In case of split reduction barrier, primary thread may send 1941 acquired event early, before the final summation into the shared 1942 variable is done (final summation can be a long operation for array 1943 reductions). */ 1944 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1945 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1946 #endif /* USE_ITT_BUILD */ 1947 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1948 // Barrier - report frame end (only if active_level == 1) 1949 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 1950 __kmp_forkjoin_frames_mode && 1951 (this_thr->th.th_teams_microtask == NULL || // either not in teams 1952 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 1953 team->t.t_active_level == 1) { 1954 ident_t *loc = __kmp_threads[gtid]->th.th_ident; 1955 kmp_uint64 cur_time = __itt_get_timestamp(); 1956 kmp_info_t **other_threads = team->t.t_threads; 1957 int nproc = this_thr->th.th_team_nproc; 1958 int i; 1959 switch (__kmp_forkjoin_frames_mode) { 1960 case 1: 1961 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1962 loc, nproc); 1963 this_thr->th.th_frame_time = cur_time; 1964 break; 1965 case 2: // AC 2015-01-19: currently does not work for hierarchical (to 1966 // be fixed) 1967 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1968 1, loc, nproc); 1969 break; 1970 case 3: 1971 if (__itt_metadata_add_ptr) { 1972 // Initialize with primary thread's wait time 1973 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 1974 // Set arrive time to zero to be able to check it in 1975 // __kmp_invoke_task(); the same is done inside the loop below 1976 this_thr->th.th_bar_arrive_time = 0; 1977 for (i = 1; i < nproc; ++i) { 1978 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 1979 other_threads[i]->th.th_bar_arrive_time = 0; 1980 } 1981 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 1982 cur_time, delta, 1983 (kmp_uint64)(reduce != NULL)); 1984 } 1985 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 1986 loc, nproc); 1987 this_thr->th.th_frame_time = cur_time; 1988 break; 1989 } 1990 } 1991 #endif /* USE_ITT_BUILD */ 1992 } else { 1993 status = 1; 1994 #if USE_ITT_BUILD 1995 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 1996 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 1997 #endif /* USE_ITT_BUILD */ 1998 } 1999 if ((status == 1 || !is_split) && !cancelled) { 2000 if (cancellable) { 2001 cancelled = __kmp_linear_barrier_release_cancellable( 2002 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2003 } else { 2004 switch (__kmp_barrier_release_pattern[bt]) { 2005 case bp_dist_bar: { 2006 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2007 __kmp_dist_barrier_release(bt, this_thr, gtid, tid, 2008 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2009 break; 2010 } 2011 case bp_hyper_bar: { 2012 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2013 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 2014 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2015 break; 2016 } 2017 case bp_hierarchical_bar: { 2018 __kmp_hierarchical_barrier_release( 
2019 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2020 break; 2021 } 2022 case bp_tree_bar: { 2023 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2024 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 2025 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2026 break; 2027 } 2028 default: { 2029 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 2030 FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); 2031 } 2032 } 2033 } 2034 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { 2035 __kmp_task_team_sync(this_thr, team); 2036 } 2037 } 2038 2039 #if USE_ITT_BUILD 2040 /* GEH: TODO: Move this under if-condition above and also include in 2041 __kmp_end_split_barrier(). This will more accurately represent the actual 2042 release time of the threads for split barriers. */ 2043 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2044 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2045 #endif /* USE_ITT_BUILD */ 2046 } else { // Team is serialized. 2047 status = 0; 2048 if (__kmp_tasking_mode != tskm_immediate_exec) { 2049 if (this_thr->th.th_task_team != NULL) { 2050 #if USE_ITT_NOTIFY 2051 void *itt_sync_obj = NULL; 2052 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2053 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); 2054 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 2055 } 2056 #endif 2057 2058 KMP_DEBUG_ASSERT( 2059 this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE || 2060 this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == 2061 TRUE); 2062 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 2063 __kmp_task_team_setup(this_thr, team); 2064 2065 #if USE_ITT_BUILD 2066 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2067 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2068 #endif /* USE_ITT_BUILD */ 2069 } 2070 } 2071 } 2072 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", 2073 gtid, __kmp_team_from_gtid(gtid)->t.t_id, 2074 __kmp_tid_from_gtid(gtid), status)); 2075 2076 #if OMPT_SUPPORT 2077 if (ompt_enabled.enabled) { 2078 #if OMPT_OPTIONAL 2079 if (ompt_enabled.ompt_callback_sync_region_wait) { 2080 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2081 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 2082 return_address); 2083 } 2084 if (ompt_enabled.ompt_callback_sync_region) { 2085 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2086 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, 2087 return_address); 2088 } 2089 #endif 2090 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 2091 } 2092 #endif 2093 2094 if (cancellable) 2095 return (int)cancelled; 2096 return status; 2097 } 2098 2099 // Returns 0 if primary thread, 1 if worker thread. 
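// A minimal caller-side sketch (illustrative only; the compiler-facing entry
// points such as __kmpc_barrier live outside this file): a plain, non-split
// barrier with no reduction callback would be issued roughly as
//
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE /*is_split*/,
//                 0 /*reduce_size*/, NULL /*reduce_data*/, NULL /*reduce*/);
//
// whereas a split reduction barrier would additionally pass the reduction
// buffer and combiner callback.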
2100 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, 2101 size_t reduce_size, void *reduce_data, 2102 void (*reduce)(void *, void *)) { 2103 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, 2104 reduce); 2105 } 2106 2107 #if defined(KMP_GOMP_COMPAT) 2108 // Returns 1 if cancelled, 0 otherwise 2109 int __kmp_barrier_gomp_cancel(int gtid) { 2110 if (__kmp_omp_cancellation) { 2111 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 2112 0, NULL, NULL); 2113 if (cancelled) { 2114 int tid = __kmp_tid_from_gtid(gtid); 2115 kmp_info_t *this_thr = __kmp_threads[gtid]; 2116 if (KMP_MASTER_TID(tid)) { 2117 // Primary thread does not need to revert anything 2118 } else { 2119 // Workers need to revert their private b_arrived flag 2120 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= 2121 KMP_BARRIER_STATE_BUMP; 2122 } 2123 } 2124 return cancelled; 2125 } 2126 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); 2127 return FALSE; 2128 } 2129 #endif 2130 2131 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { 2132 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); 2133 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); 2134 KMP_DEBUG_ASSERT(bt < bs_last_barrier); 2135 int tid = __kmp_tid_from_gtid(gtid); 2136 kmp_info_t *this_thr = __kmp_threads[gtid]; 2137 kmp_team_t *team = this_thr->th.th_team; 2138 2139 if (!team->t.t_serialized) { 2140 if (KMP_MASTER_GTID(gtid)) { 2141 switch (__kmp_barrier_release_pattern[bt]) { 2142 case bp_dist_bar: { 2143 __kmp_dist_barrier_release(bt, this_thr, gtid, tid, 2144 FALSE USE_ITT_BUILD_ARG(NULL)); 2145 break; 2146 } 2147 case bp_hyper_bar: { 2148 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2149 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, 2150 FALSE USE_ITT_BUILD_ARG(NULL)); 2151 break; 2152 } 2153 case bp_hierarchical_bar: { 2154 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, 2155 FALSE USE_ITT_BUILD_ARG(NULL)); 2156 break; 2157 } 2158 case bp_tree_bar: { 2159 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); 2160 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, 2161 FALSE USE_ITT_BUILD_ARG(NULL)); 2162 break; 2163 } 2164 default: { 2165 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, 2166 FALSE USE_ITT_BUILD_ARG(NULL)); 2167 } 2168 } 2169 if (__kmp_tasking_mode != tskm_immediate_exec) { 2170 __kmp_task_team_sync(this_thr, team); 2171 } // if 2172 } 2173 } 2174 } 2175 2176 void __kmp_join_barrier(int gtid) { 2177 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); 2178 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 2179 2180 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 2181 2182 kmp_info_t *this_thr = __kmp_threads[gtid]; 2183 kmp_team_t *team; 2184 int tid; 2185 #ifdef KMP_DEBUG 2186 int team_id; 2187 #endif /* KMP_DEBUG */ 2188 #if USE_ITT_BUILD 2189 void *itt_sync_obj = NULL; 2190 #if USE_ITT_NOTIFY 2191 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need 2192 // Get object created at fork_barrier 2193 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2194 #endif 2195 #endif /* USE_ITT_BUILD */ 2196 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG) 2197 int nproc = this_thr->th.th_team_nproc; 2198 #endif 2199 KMP_MB(); 2200 2201 // Get current info 2202 team = this_thr->th.th_team; 2203 KMP_DEBUG_ASSERT(nproc == team->t.t_nproc); 2204 tid = __kmp_tid_from_gtid(gtid); 2205 #ifdef KMP_DEBUG 2206 team_id = team->t.t_id; 2207 kmp_info_t *master_thread = 
this_thr->th.th_team_master; 2208 if (master_thread != team->t.t_threads[0]) { 2209 __kmp_print_structure(); 2210 } 2211 #endif /* KMP_DEBUG */ 2212 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); 2213 KMP_MB(); 2214 2215 // Verify state 2216 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); 2217 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); 2218 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); 2219 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", 2220 gtid, team_id, tid)); 2221 2222 #if OMPT_SUPPORT 2223 if (ompt_enabled.enabled) { 2224 #if OMPT_OPTIONAL 2225 ompt_data_t *my_task_data; 2226 ompt_data_t *my_parallel_data; 2227 void *codeptr = NULL; 2228 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2229 if (KMP_MASTER_TID(ds_tid) && 2230 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2231 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2232 codeptr = team->t.ompt_team_info.master_return_address; 2233 my_task_data = OMPT_CUR_TASK_DATA(this_thr); 2234 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); 2235 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; 2236 ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel; 2237 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) { 2238 sync_kind = ompt_sync_region_barrier_teams; 2239 ompt_state = ompt_state_wait_barrier_teams; 2240 } 2241 if (ompt_enabled.ompt_callback_sync_region) { 2242 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2243 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); 2244 } 2245 if (ompt_enabled.ompt_callback_sync_region_wait) { 2246 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2247 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); 2248 } 2249 if (!KMP_MASTER_TID(ds_tid)) 2250 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); 2251 #endif 2252 this_thr->th.ompt_thread_info.state = ompt_state; 2253 } 2254 #endif 2255 2256 if (__kmp_tasking_mode == tskm_extra_barrier) { 2257 __kmp_tasking_barrier(team, this_thr, gtid); 2258 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", 2259 gtid, team_id, tid)); 2260 } 2261 #ifdef KMP_DEBUG 2262 if (__kmp_tasking_mode != tskm_immediate_exec) { 2263 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " 2264 "%p, th_task_team = %p\n", 2265 __kmp_gtid_from_thread(this_thr), team_id, 2266 team->t.t_task_team[this_thr->th.th_task_state], 2267 this_thr->th.th_task_team)); 2268 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr); 2269 } 2270 #endif /* KMP_DEBUG */ 2271 2272 /* Copy the blocktime info to the thread, where __kmp_wait_template() can 2273 access it when the team struct is not guaranteed to exist. Doing these 2274 loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround, 2275 we do not perform the copy if blocktime=infinite, since the values are not 2276 used by __kmp_wait_template() in that case.
*/ 2277 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2278 #if KMP_USE_MONITOR 2279 this_thr->th.th_team_bt_intervals = 2280 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 2281 this_thr->th.th_team_bt_set = 2282 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 2283 #else 2284 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 2285 #endif 2286 } 2287 2288 #if USE_ITT_BUILD 2289 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2290 __kmp_itt_barrier_starting(gtid, itt_sync_obj); 2291 #endif /* USE_ITT_BUILD */ 2292 2293 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { 2294 case bp_dist_bar: { 2295 __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2296 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2297 break; 2298 } 2299 case bp_hyper_bar: { 2300 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 2301 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2302 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2303 break; 2304 } 2305 case bp_hierarchical_bar: { 2306 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2307 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2308 break; 2309 } 2310 case bp_tree_bar: { 2311 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); 2312 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2313 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2314 break; 2315 } 2316 default: { 2317 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, 2318 NULL USE_ITT_BUILD_ARG(itt_sync_obj)); 2319 } 2320 } 2321 2322 /* From this point on, the team data structure may be deallocated at any time 2323 by the primary thread - it is unsafe to reference it in any of the worker 2324 threads. Any per-team data items that need to be referenced before the 2325 end of the barrier should be moved to the kmp_task_team_t structs. */ 2326 if (KMP_MASTER_TID(tid)) { 2327 if (__kmp_tasking_mode != tskm_immediate_exec) { 2328 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); 2329 } 2330 if (__kmp_display_affinity) { 2331 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); 2332 } 2333 #if KMP_STATS_ENABLED 2334 // Have primary thread flag the workers to indicate they are now waiting for 2335 // the next parallel region. Also wake them up so they switch their timers to 2336 // idle.
2337 for (int i = 0; i < team->t.t_nproc; ++i) { 2338 kmp_info_t *team_thread = team->t.t_threads[i]; 2339 if (team_thread == this_thr) 2340 continue; 2341 team_thread->th.th_stats->setIdleFlag(); 2342 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && 2343 team_thread->th.th_sleep_loc != NULL) 2344 __kmp_null_resume_wrapper(team_thread); 2345 } 2346 #endif 2347 #if USE_ITT_BUILD 2348 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2349 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2350 #endif /* USE_ITT_BUILD */ 2351 2352 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2353 // Join barrier - report frame end 2354 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2355 __kmp_forkjoin_frames_mode && 2356 (this_thr->th.th_teams_microtask == NULL || // either not in teams 2357 this_thr->th.th_teams_size.nteams == 1) && // or inside single team 2358 team->t.t_active_level == 1) { 2359 kmp_uint64 cur_time = __itt_get_timestamp(); 2360 ident_t *loc = team->t.t_ident; 2361 kmp_info_t **other_threads = team->t.t_threads; 2362 switch (__kmp_forkjoin_frames_mode) { 2363 case 1: 2364 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 2365 loc, nproc); 2366 break; 2367 case 2: 2368 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, 2369 loc, nproc); 2370 break; 2371 case 3: 2372 if (__itt_metadata_add_ptr) { 2373 // Initialize with primary thread's wait time 2374 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; 2375 // Set arrive time to zero to be able to check it in 2376 // __kmp_invoke_task(); the same is done inside the loop below 2377 this_thr->th.th_bar_arrive_time = 0; 2378 for (int i = 1; i < nproc; ++i) { 2379 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); 2380 other_threads[i]->th.th_bar_arrive_time = 0; 2381 } 2382 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, 2383 cur_time, delta, 0); 2384 } 2385 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, 2386 loc, nproc); 2387 this_thr->th.th_frame_time = cur_time; 2388 break; 2389 } 2390 } 2391 #endif /* USE_ITT_BUILD */ 2392 } 2393 #if USE_ITT_BUILD 2394 else { 2395 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) 2396 __kmp_itt_barrier_middle(gtid, itt_sync_obj); 2397 } 2398 #endif /* USE_ITT_BUILD */ 2399 2400 #if KMP_DEBUG 2401 if (KMP_MASTER_TID(tid)) { 2402 KA_TRACE( 2403 15, 2404 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", 2405 gtid, team_id, tid, nproc)); 2406 } 2407 #endif /* KMP_DEBUG */ 2408 2409 // TODO now, mark worker threads as done so they may be disbanded 2410 KMP_MB(); // Flush all pending memory write invalidates. 2411 KA_TRACE(10, 2412 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); 2413 2414 } 2415 2416 // TODO release worker threads' fork barriers as we are ready instead of all at 2417 // once 2418 void __kmp_fork_barrier(int gtid, int tid) { 2419 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); 2420 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); 2421 kmp_info_t *this_thr = __kmp_threads[gtid]; 2422 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; 2423 #if USE_ITT_BUILD 2424 void *itt_sync_obj = NULL; 2425 #endif /* USE_ITT_BUILD */ 2426 #ifdef KMP_DEBUG 2427 if (team) 2428 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, 2429 (team != NULL) ? 
team->t.t_id : -1, tid)); 2430 #endif 2431 // th_team pointer only valid for primary thread here 2432 if (KMP_MASTER_TID(tid)) { 2433 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2434 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2435 // Create itt barrier object 2436 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); 2437 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing 2438 } 2439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2440 2441 #ifdef KMP_DEBUG 2442 KMP_DEBUG_ASSERT(team); 2443 kmp_info_t **other_threads = team->t.t_threads; 2444 int i; 2445 2446 // Verify state 2447 KMP_MB(); 2448 2449 for (i = 1; i < team->t.t_nproc; ++i) { 2450 KA_TRACE(500, 2451 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " 2452 "== %u.\n", 2453 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, 2454 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, 2455 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); 2456 KMP_DEBUG_ASSERT( 2457 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & 2458 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); 2459 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); 2460 } 2461 #endif 2462 2463 if (__kmp_tasking_mode != tskm_immediate_exec) 2464 __kmp_task_team_setup(this_thr, team); 2465 2466 /* The primary thread may have changed its blocktime between join barrier 2467 and fork barrier. Copy the blocktime info to the thread, where 2468 __kmp_wait_template() can access it when the team struct is not 2469 guaranteed to exist. */ 2470 // See note about the corresponding code in __kmp_join_barrier() being 2471 // performance-critical 2472 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2473 #if KMP_USE_MONITOR 2474 this_thr->th.th_team_bt_intervals = 2475 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; 2476 this_thr->th.th_team_bt_set = 2477 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; 2478 #else 2479 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); 2480 #endif 2481 } 2482 } // primary thread 2483 2484 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { 2485 case bp_dist_bar: { 2486 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2487 TRUE USE_ITT_BUILD_ARG(NULL)); 2488 break; 2489 } 2490 case bp_hyper_bar: { 2491 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 2492 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2493 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2494 break; 2495 } 2496 case bp_hierarchical_bar: { 2497 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2498 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2499 break; 2500 } 2501 case bp_tree_bar: { 2502 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); 2503 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2504 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2505 break; 2506 } 2507 default: { 2508 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, 2509 TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2510 } 2511 } 2512 2513 #if OMPT_SUPPORT 2514 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; 2515 if (ompt_enabled.enabled && 2516 (ompt_state == ompt_state_wait_barrier_teams || 2517 ompt_state == ompt_state_wait_barrier_implicit_parallel)) { 2518 int ds_tid = this_thr->th.th_info.ds.ds_tid; 2519 ompt_data_t *task_data = (team) 2520 ? 
OMPT_CUR_TASK_DATA(this_thr) 2521 : &(this_thr->th.ompt_thread_info.task_data); 2522 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 2523 #if OMPT_OPTIONAL 2524 void *codeptr = NULL; 2525 if (KMP_MASTER_TID(ds_tid) && 2526 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 2527 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 2528 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; 2529 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; 2530 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) 2531 sync_kind = ompt_sync_region_barrier_teams; 2532 if (ompt_enabled.ompt_callback_sync_region_wait) { 2533 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2534 sync_kind, ompt_scope_end, NULL, task_data, codeptr); 2535 } 2536 if (ompt_enabled.ompt_callback_sync_region) { 2537 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2538 sync_kind, ompt_scope_end, NULL, task_data, codeptr); 2539 } 2540 #endif 2541 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 2542 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2543 ompt_scope_end, NULL, task_data, 0, ds_tid, 2544 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 2545 } 2546 } 2547 #endif 2548 2549 // Early exit for reaping threads releasing forkjoin barrier 2550 if (TCR_4(__kmp_global.g.g_done)) { 2551 this_thr->th.th_task_team = NULL; 2552 2553 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2554 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2555 if (!KMP_MASTER_TID(tid)) { 2556 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2557 if (itt_sync_obj) 2558 __kmp_itt_barrier_finished(gtid, itt_sync_obj); 2559 } 2560 } 2561 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2562 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); 2563 return; 2564 } 2565 2566 /* We can now assume that a valid team structure has been allocated by the 2567 primary thread and propagated to all worker threads. The current thread, 2568 however, may not be part of the team, so we can't blindly assume that the 2569 team pointer is non-null. */ 2570 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); 2571 KMP_DEBUG_ASSERT(team != NULL); 2572 tid = __kmp_tid_from_gtid(gtid); 2573 2574 #if KMP_BARRIER_ICV_PULL 2575 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2576 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2577 implicit task has this data before this function is called. We cannot 2578 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's 2579 thread struct, because it is not always the case that the threads arrays 2580 have been allocated when __kmp_fork_call() is executed. */ 2581 { 2582 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); 2583 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs 2584 // Copy the initial ICVs from the primary thread's thread struct to the 2585 // implicit task for this tid. 
2586 KA_TRACE(10, 2587 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); 2588 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, 2589 tid, FALSE); 2590 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, 2591 &team->t.t_threads[0] 2592 ->th.th_bar[bs_forkjoin_barrier] 2593 .bb.th_fixed_icvs); 2594 } 2595 } 2596 #endif // KMP_BARRIER_ICV_PULL 2597 2598 if (__kmp_tasking_mode != tskm_immediate_exec) { 2599 __kmp_task_team_sync(this_thr, team); 2600 } 2601 2602 #if KMP_AFFINITY_SUPPORTED 2603 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 2604 if (proc_bind == proc_bind_intel) { 2605 // Call dynamic affinity settings 2606 if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) { 2607 __kmp_balanced_affinity(this_thr, team->t.t_nproc); 2608 } 2609 } else if (proc_bind != proc_bind_false) { 2610 if (this_thr->th.th_new_place == this_thr->th.th_current_place) { 2611 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", 2612 __kmp_gtid_from_thread(this_thr), 2613 this_thr->th.th_current_place)); 2614 } else { 2615 __kmp_affinity_bind_place(gtid); 2616 } 2617 } 2618 #endif // KMP_AFFINITY_SUPPORTED 2619 // Perform the display affinity functionality 2620 if (__kmp_display_affinity) { 2621 if (team->t.t_display_affinity 2622 #if KMP_AFFINITY_SUPPORTED 2623 || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) 2624 #endif 2625 ) { 2626 // NULL means use the affinity-format-var ICV 2627 __kmp_aux_display_affinity(gtid, NULL); 2628 this_thr->th.th_prev_num_threads = team->t.t_nproc; 2629 this_thr->th.th_prev_level = team->t.t_level; 2630 } 2631 } 2632 if (!KMP_MASTER_TID(tid)) 2633 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); 2634 2635 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2636 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2637 if (!KMP_MASTER_TID(tid)) { 2638 // Get correct barrier object 2639 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2640 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired 2641 } // (prepare called inside barrier_release) 2642 } 2643 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2644 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, 2645 team->t.t_id, tid)); 2646 } 2647 2648 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 2649 kmp_internal_control_t *new_icvs, ident_t *loc) { 2650 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); 2651 2652 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); 2653 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 2654 2655 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in 2656 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's 2657 implicit task has this data before this function is called. */ 2658 #if KMP_BARRIER_ICV_PULL 2659 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which 2660 remains untouched), where all of the worker threads can access them and 2661 make their own copies after the barrier. */ 2662 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2663 // allocated at this point 2664 copy_icvs( 2665 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, 2666 new_icvs); 2667 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, 2668 team->t.t_threads[0], team)); 2669 #elif KMP_BARRIER_ICV_PUSH 2670 // The ICVs will be propagated in the fork barrier, so nothing needs to be 2671 // done here. 
2672 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, 2673 team->t.t_threads[0], team)); 2674 #else 2675 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) 2676 // time. 2677 ngo_load(new_icvs); 2678 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be 2679 // allocated at this point 2680 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread 2681 // TODO: GEH - pass in better source location info since usually NULL here 2682 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2683 f, team->t.t_threads[f], team)); 2684 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); 2685 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); 2686 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", 2687 f, team->t.t_threads[f], team)); 2688 } 2689 ngo_sync(); 2690 #endif // KMP_BARRIER_ICV_PULL 2691 } 2692
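// Recap of the three ICV propagation strategies selected at compile time above:
//   - KMP_BARRIER_ICV_PULL: new_icvs are staged once in the primary thread's
//     th_fixed_icvs here, and each worker copies them out for itself in
//     __kmp_fork_barrier().
//   - KMP_BARRIER_ICV_PUSH: nothing is copied here; the release phase of the
//     fork barrier pushes the ICVs down to the workers (see the propagate_icvs
//     paths earlier in this file).
//   - default: this function copies new_icvs into every non-primary thread's
//     implicit task directly, an O(nthreads) loop using ngo stores where
//     available.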