1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
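// On KMP_MIC with USE_NGO_STORES, a cache line of ICVs is loaded into a
// 512-bit register and written back with non-globally-ordered stores
// (_mm512_storenrngo_pd); ngo_sync() then issues a locked add to order those
// stores. On all other targets these macros fall back to plain copies
// (copy_icvs / KMP_MEMCPY) and no-ops.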
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
44 
45 // Compute how many threads should poll each cache line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47 void distributedBarrier::computeVarsForN(size_t n) {
48   int nsockets = 1;
49   if (__kmp_topology) {
50     int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51     int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52     int ncores_per_socket =
53         __kmp_topology->calculate_ratio(core_level, socket_level);
54     nsockets = __kmp_topology->get_count(socket_level);
55 
56     if (nsockets <= 0)
57       nsockets = 1;
58     if (ncores_per_socket <= 0)
59       ncores_per_socket = 1;
60 
61     threads_per_go = ncores_per_socket >> 1;
62     if (!fix_threads_per_go) {
63       // Minimize num_gos
64       if (threads_per_go > 4) {
65         if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66           threads_per_go = threads_per_go >> 1;
67         }
68         if (threads_per_go > 4 && nsockets == 1)
69           threads_per_go = threads_per_go >> 1;
70       }
71     }
72     if (threads_per_go == 0)
73       threads_per_go = 1;
74     fix_threads_per_go = true;
75     num_gos = n / threads_per_go;
76     if (n % threads_per_go)
77       num_gos++;
78     if (nsockets == 1 || num_gos == 1)
79       num_groups = 1;
80     else {
81       num_groups = num_gos / nsockets;
82       if (num_gos % nsockets)
83         num_groups++;
84     }
85     if (num_groups <= 0)
86       num_groups = 1;
87     gos_per_group = num_gos / num_groups;
88     if (num_gos % num_groups)
89       gos_per_group++;
90     threads_per_group = threads_per_go * gos_per_group;
91   } else {
92     num_gos = n / threads_per_go;
93     if (n % threads_per_go)
94       num_gos++;
95     if (num_gos == 1)
96       num_groups = 1;
97     else {
98       num_groups = num_gos / 2;
99       if (num_gos % 2)
100         num_groups++;
101     }
102     gos_per_group = num_gos / num_groups;
103     if (num_gos % num_groups)
104       gos_per_group++;
105     threads_per_group = threads_per_go * gos_per_group;
106   }
107 }
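// Illustrative walk-through (hypothetical numbers): with __kmp_topology
// reporting 2 sockets and 16 cores per socket, KMP_OPTIMIZE_FOR_REDUCTIONS
// off, fix_threads_per_go not yet set, and n = 64, the code above yields
// threads_per_go = 16 >> 1 = 8, num_gos = 64 / 8 = 8, num_groups = 8 / 2 = 4,
// gos_per_group = 8 / 4 = 2 and threads_per_group = 8 * 2 = 16, i.e. two
// groups per socket, each spanning two go flags and 16 threads.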
108 
109 void distributedBarrier::computeGo(size_t n) {
110   // Minimize num_gos
111   for (num_gos = 1;; num_gos++)
112     if (IDEAL_CONTENTION * num_gos >= n)
113       break;
114   threads_per_go = n / num_gos;
115   if (n % num_gos)
116     threads_per_go++;
117   while (num_gos > MAX_GOS) {
118     threads_per_go++;
119     num_gos = n / threads_per_go;
120     if (n % threads_per_go)
121       num_gos++;
122   }
123   computeVarsForN(n);
124 }
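// Equivalently: computeGo() picks the smallest num_gos with
// IDEAL_CONTENTION * num_gos >= n (i.e. roughly ceil(n / IDEAL_CONTENTION)),
// sets threads_per_go = ceil(n / num_gos), and then grows threads_per_go
// until num_gos fits within MAX_GOS before recomputing the remaining
// quantities in computeVarsForN().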
125 
126 // This function resizes the barrier arrays when the new number of threads
127 // exceeds max_threads, the current size of all the arrays.
128 void distributedBarrier::resize(size_t nthr) {
129   KMP_DEBUG_ASSERT(nthr > max_threads);
130 
131   // expand to requested size * 2
132   max_threads = nthr * 2;
133 
134   // allocate arrays to new max threads
135   for (int i = 0; i < MAX_ITERS; ++i) {
136     if (flags[i])
137       flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138                                                  max_threads * sizeof(flags_s));
139     else
140       flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141   }
142 
143   if (go)
144     go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145   else
146     go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147 
148   if (iter)
149     iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150   else
151     iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152 
153   if (sleep)
154     sleep =
155         (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156   else
157     sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158 }
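// Growing to nthr * 2 rather than exactly nthr amortizes the cost of the
// reallocations above across future increases in the team size.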
159 
160 // This function sets all the go flags that threads might be waiting on.
161 // When blocktime is not infinite, it should be followed by a wake-up call
162 // to each thread.
163 kmp_uint64 distributedBarrier::go_release() {
164   kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165   for (size_t j = 0; j < num_gos; j++) {
166     go[j].go.store(next_go);
167   }
168   return next_go;
169 }
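// The value stored above, iter[0].iter + MAX_ITERS, matches the next_go that
// waiters in __kmp_dist_barrier_release() compute from their own iteration
// count (assuming it agrees with the primary thread's), so every thread
// polling a go flag is released.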
170 
171 void distributedBarrier::go_reset() {
172   for (size_t j = 0; j < max_threads; ++j) {
173     for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174       flags[i][j].stillNeed = 1;
175     }
176     go[j].go.store(0);
177     iter[j].iter = 0;
178   }
179 }
180 
181 // This function inits/re-inits the distributed barrier for a particular number
182 // of threads. If a resize of arrays is needed, it calls the resize function.
183 void distributedBarrier::init(size_t nthr) {
184   size_t old_max = max_threads;
185   if (nthr > max_threads) { // need more space in arrays
186     resize(nthr);
187   }
188 
189   for (size_t i = 0; i < max_threads; i++) {
190     for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191       flags[j][i].stillNeed = 1;
192     }
193     go[i].go.store(0);
194     iter[i].iter = 0;
195     if (i >= old_max)
196       sleep[i].sleep = false;
197   }
198 
199   // Recalculate num_gos, etc. based on new nthr
200   computeVarsForN(nthr);
201 
202   num_threads = nthr;
203 
204   if (team_icvs == NULL)
205     team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207 
208 // This function is used only when KMP_BLOCKTIME is not infinite.
209 // static
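// It unconditionally resumes every thread in [start, stop) with stride inc;
// __kmp_atomic_resume_64() is presumably harmless for a target that is not
// actually sleeping, so no sleep-state check is made here.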
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211                                size_t start, size_t stop, size_t inc,
212                                size_t tid) {
213   KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214   if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215     return;
216 
217   kmp_info_t **other_threads = team->t.t_threads;
218   for (size_t thr = start; thr < stop; thr += inc) {
219     KMP_DEBUG_ASSERT(other_threads[thr]);
220     int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221     // Wake up the worker regardless of whether it appears to be sleeping
222     __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223   }
224 }
225 
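// Overview of the gather phase of the distributed barrier: non-leader
// threads simply flip their stillNeed flags (each on its own cache line) and
// return. Each group leader first spins on the flags of the threads in its
// group, optionally reducing their data, then flips its own flag and spins
// until every group leader has arrived; the primary thread additionally
// reduces across the group leaders. While spinning, threads also execute
// available tasks when tasking is enabled.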
226 static void __kmp_dist_barrier_gather(
227     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230   kmp_team_t *team;
231   distributedBarrier *b;
232   kmp_info_t **other_threads;
233   kmp_uint64 my_current_iter, my_next_iter;
234   kmp_uint32 nproc;
235   bool group_leader;
236 
237   team = this_thr->th.th_team;
238   nproc = this_thr->th.th_team_nproc;
239   other_threads = team->t.t_threads;
240   b = team->t.b;
241   my_current_iter = b->iter[tid].iter;
242   my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243   group_leader = ((tid % b->threads_per_group) == 0);
244 
245   KA_TRACE(20,
246            ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247             gtid, team->t.t_id, tid, bt));
248 
249 #if USE_ITT_BUILD && USE_ITT_NOTIFY
250   // Barrier imbalance - save arrive time to the thread
251   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253         __itt_get_timestamp();
254   }
255 #endif
256 
257   if (group_leader) {
258     // Start from the thread after the group leader
259     size_t group_start = tid + 1;
260     size_t group_end = tid + b->threads_per_group;
261     size_t threads_pending = 0;
262 
263     if (group_end > nproc)
264       group_end = nproc;
265     do { // wait for threads in my group
266       threads_pending = 0;
267       // Check all the flags every time to avoid branch mispredicts
268       for (size_t thr = group_start; thr < group_end; thr++) {
269         // Each thread uses a different cache line
270         threads_pending += b->flags[my_current_iter][thr].stillNeed;
271       }
272       // Execute tasks here
273       if (__kmp_tasking_mode != tskm_immediate_exec) {
274         kmp_task_team_t *task_team = this_thr->th.th_task_team;
275         if (task_team != NULL) {
276           if (TCR_SYNC_4(task_team->tt.tt_active)) {
277             if (KMP_TASKING_ENABLED(task_team)) {
278               int tasks_completed = FALSE;
279               __kmp_atomic_execute_tasks_64(
280                   this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281                   &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282             } else
283               this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284           }
285         } else {
286           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287         } // if
288       }
289       if (TCR_4(__kmp_global.g.g_done)) {
290         if (__kmp_global.g.g_abort)
291           __kmp_abort_thread();
292         break;
293       } else if (__kmp_tasking_mode != tskm_immediate_exec &&
294                  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296       }
297     } while (threads_pending > 0);
298 
299     if (reduce) { // Perform reduction if needed
300       OMPT_REDUCTION_DECL(this_thr, gtid);
301       OMPT_REDUCTION_BEGIN;
302       // Group leader reduces all threads in group
303       for (size_t thr = group_start; thr < group_end; thr++) {
304         (*reduce)(this_thr->th.th_local.reduce_data,
305                   other_threads[thr]->th.th_local.reduce_data);
306       }
307       OMPT_REDUCTION_END;
308     }
309 
310     // Set flag for next iteration
311     b->flags[my_next_iter][tid].stillNeed = 1;
312     // Each thread uses a different cache line; reset stillNeed to 0 to
313     // indicate this thread has reached the barrier
314     b->flags[my_current_iter][tid].stillNeed = 0;
315 
316     do { // wait for all group leaders
317       threads_pending = 0;
318       for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319         threads_pending += b->flags[my_current_iter][thr].stillNeed;
320       }
321       // Execute tasks here
322       if (__kmp_tasking_mode != tskm_immediate_exec) {
323         kmp_task_team_t *task_team = this_thr->th.th_task_team;
324         if (task_team != NULL) {
325           if (TCR_SYNC_4(task_team->tt.tt_active)) {
326             if (KMP_TASKING_ENABLED(task_team)) {
327               int tasks_completed = FALSE;
328               __kmp_atomic_execute_tasks_64(
329                   this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330                   &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331             } else
332               this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333           }
334         } else {
335           this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336         } // if
337       }
338       if (TCR_4(__kmp_global.g.g_done)) {
339         if (__kmp_global.g.g_abort)
340           __kmp_abort_thread();
341         break;
342       } else if (__kmp_tasking_mode != tskm_immediate_exec &&
343                  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345       }
346     } while (threads_pending > 0);
347 
348     if (reduce) { // Perform reduction if needed
349       if (KMP_MASTER_TID(tid)) { // Primary thread reduces over group leaders
350         OMPT_REDUCTION_DECL(this_thr, gtid);
351         OMPT_REDUCTION_BEGIN;
352         for (size_t thr = b->threads_per_group; thr < nproc;
353              thr += b->threads_per_group) {
354           (*reduce)(this_thr->th.th_local.reduce_data,
355                     other_threads[thr]->th.th_local.reduce_data);
356         }
357         OMPT_REDUCTION_END;
358       }
359     }
360   } else {
361     // Set flag for next iteration
362     b->flags[my_next_iter][tid].stillNeed = 1;
363     // Each thread uses a different cache line; reset stillNeed to 0 to
364     // indicate this thread has reached the barrier
365     b->flags[my_current_iter][tid].stillNeed = 0;
366   }
367 
368   KMP_MFENCE();
369 
370   KA_TRACE(20,
371            ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372             gtid, team->t.t_id, tid, bt));
373 }
374 
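// Overview of the release phase of the distributed barrier: the primary
// thread stores next_go into the leading go flag of every group, wakes the
// group leaders when blocktime is finite, and then releases and wakes its
// own group. Each group leader propagates next_go to the remaining go flags
// of its group and wakes its group members. Before waiting on their group's
// go flag, workers reconcile th_used_in_team (0 = parked on the thread
// struct, 3 = transitioning into the team, 1 = in the team) so that threads
// woken for team resizing or shutdown do not wait here.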
375 static void __kmp_dist_barrier_release(
376     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379   kmp_team_t *team;
380   distributedBarrier *b;
381   kmp_bstate_t *thr_bar;
382   kmp_uint64 my_current_iter, next_go;
383   size_t my_go_index;
384   bool group_leader;
385 
386   KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387                 gtid, tid, bt));
388 
389   thr_bar = &this_thr->th.th_bar[bt].bb;
390 
391   if (!KMP_MASTER_TID(tid)) {
392     // Workers and non-primary group leaders must check they are in the team
393     do {
394       if (this_thr->th.th_used_in_team.load() != 1 &&
395           this_thr->th.th_used_in_team.load() != 3) {
396         // Thread is not in use in a team. Wait on location in tid's thread
397         // struct. The 0 value tells anyone looking that this thread is spinning
398         // or sleeping until this location becomes 3 again; 3 is the transition
399         // state to get to 1, which means waiting on go and being in the team
400         kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401         if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402                                         0) ||
403             this_thr->th.th_used_in_team.load() == 0) {
404           my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405         }
406 #if USE_ITT_BUILD && USE_ITT_NOTIFY
407         if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408           // In fork barrier where we could not get the object reliably
409           itt_sync_obj =
410               __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411           // Cancel wait on previous parallel region...
412           __kmp_itt_task_starting(itt_sync_obj);
413 
414           if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415             return;
416 
417           itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418           if (itt_sync_obj != NULL)
419             // Call prepare as early as possible for "new" barrier
420             __kmp_itt_task_finished(itt_sync_obj);
421         } else
422 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423             if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424           return;
425       }
426       if (this_thr->th.th_used_in_team.load() != 1 &&
427           this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428         continue;
429       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430         return;
431 
432       // At this point, the thread thinks it is in use in a team, or in
433       // transition to be used in a team, but it might have reached this barrier
434       // before it was marked unused by the team. Unused threads are awoken and
435       // shifted to wait on local thread struct elsewhere. It also might reach
436       // this point by being picked up for use by a different team. Either way,
437       // we need to update the tid.
438       tid = __kmp_tid_from_gtid(gtid);
439       team = this_thr->th.th_team;
440       KMP_DEBUG_ASSERT(tid >= 0);
441       KMP_DEBUG_ASSERT(team);
442       b = team->t.b;
443       my_current_iter = b->iter[tid].iter;
444       next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445       my_go_index = tid / b->threads_per_go;
446       if (this_thr->th.th_used_in_team.load() == 3) {
447         (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
448                                           1);
449       }
450       // Check if go flag is set
451       if (b->go[my_go_index].go.load() != next_go) {
452         // Wait on go flag on team
453         kmp_atomic_flag_64<false, true> my_flag(
454             &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
455         my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
456         KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
457                          b->iter[tid].iter == 0);
458         KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
459       }
460 
461       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
462         return;
463       // At this point, the thread's go location was set. This means the primary
464       // thread is safely in the barrier, and so this thread's data is
465       // up-to-date, but we should check again that this thread is really in
466       // use in the team, as it could have been woken up for the purpose of
467       // changing team size, or reaping threads at shutdown.
468       if (this_thr->th.th_used_in_team.load() == 1)
469         break;
470     } while (1);
471 
472     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
473       return;
474 
475     group_leader = ((tid % b->threads_per_group) == 0);
476     if (group_leader) {
477       // Tell all the threads in my group they can go!
478       for (size_t go_idx = my_go_index + 1;
479            go_idx < my_go_index + b->gos_per_group; go_idx++) {
480         b->go[go_idx].go.store(next_go);
481       }
482       // Fence added so that workers can see changes to go. sfence inadequate.
483       KMP_MFENCE();
484     }
485 
486 #if KMP_BARRIER_ICV_PUSH
487     if (propagate_icvs) { // copy ICVs to final dest
488       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
489                                tid, FALSE);
490       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
491                 (kmp_internal_control_t *)team->t.b->team_icvs);
492       copy_icvs(&thr_bar->th_fixed_icvs,
493                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
494     }
495 #endif
496     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
497       // This thread is now awake and participating in the barrier;
498       // wake up the other threads in the group
499       size_t nproc = this_thr->th.th_team_nproc;
500       size_t group_end = tid + b->threads_per_group;
501       if (nproc < group_end)
502         group_end = nproc;
503       __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
504     }
505   } else { //  Primary thread
506     team = this_thr->th.th_team;
507     b = team->t.b;
508     my_current_iter = b->iter[tid].iter;
509     next_go = my_current_iter + distributedBarrier::MAX_ITERS;
510 #if KMP_BARRIER_ICV_PUSH
511     if (propagate_icvs) {
512       // primary thread has ICVs in final destination; copy
513       copy_icvs(&thr_bar->th_fixed_icvs,
514                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
515     }
516 #endif
517     // Tell all the group leaders they can go!
518     for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
519       b->go[go_idx].go.store(next_go);
520     }
521 
522     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
523       // Wake-up the group leaders
524       size_t nproc = this_thr->th.th_team_nproc;
525       __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
526                                 b->threads_per_group, tid);
527     }
528 
529     // Tell all the threads in my group they can go!
530     for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
531       b->go[go_idx].go.store(next_go);
532     }
533 
534     // Fence added so that workers can see changes to go. sfence inadequate.
535     KMP_MFENCE();
536 
537     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
538       // Wake-up the other threads in my group
539       size_t nproc = this_thr->th.th_team_nproc;
540       size_t group_end = tid + b->threads_per_group;
541       if (nproc < group_end)
542         group_end = nproc;
543       __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
544     }
545   }
546   // Update to next iteration
547   KMP_ASSERT(my_current_iter == b->iter[tid].iter);
548   b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
549 
550   KA_TRACE(
551       20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
552            gtid, team->t.t_id, tid, bt));
553 }
554 
555 // Linear Barrier
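// In the linear barrier each worker simply bumps its own b_arrived flag to
// signal the primary thread, which spins on every worker's flag in turn and
// performs any reduction; on release the primary pushes ICVs if requested
// and then bumps each worker's b_go flag, on which the workers are waiting.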
556 template <bool cancellable = false>
557 static bool __kmp_linear_barrier_gather_template(
558     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
559     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
560   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
561   kmp_team_t *team = this_thr->th.th_team;
562   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
563   kmp_info_t **other_threads = team->t.t_threads;
564 
565   KA_TRACE(
566       20,
567       ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
568        gtid, team->t.t_id, tid, bt));
569   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
570 
571 #if USE_ITT_BUILD && USE_ITT_NOTIFY
572   // Barrier imbalance - save arrive time to the thread
573   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
574     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
575         __itt_get_timestamp();
576   }
577 #endif
578   // We now perform a linear reduction to signal that all of the threads have
579   // arrived.
580   if (!KMP_MASTER_TID(tid)) {
581     KA_TRACE(20,
582              ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
583               " arrived(%p): %llu => %llu\n",
584               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
585               team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
586               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
587     // Mark arrival to primary thread
588     /* After performing this write, a worker thread may not assume that the team
589        is valid any more - it could be deallocated by the primary thread at any
590        time. */
591     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
592     flag.release();
593   } else {
594     kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
595     int nproc = this_thr->th.th_team_nproc;
596     int i;
597     // Don't have to worry about sleep bit here or atomic since team setting
598     kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
599 
600     // Collect all the worker team member threads.
601     for (i = 1; i < nproc; ++i) {
602 #if KMP_CACHE_MANAGE
603       // Prefetch next thread's arrived count
604       if (i + 1 < nproc)
605         KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
606 #endif /* KMP_CACHE_MANAGE */
607       KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
608                     "arrived(%p) == %llu\n",
609                     gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
610                     team->t.t_id, i,
611                     &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
612 
613       // Wait for worker thread to arrive
614       if (cancellable) {
615         kmp_flag_64<true, false> flag(
616             &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
617         if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
618           return true;
619       } else {
620         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
621                            new_state);
622         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
623       }
624 #if USE_ITT_BUILD && USE_ITT_NOTIFY
625       // Barrier imbalance - write min of the thread time and the other thread
626       // time to the thread.
627       if (__kmp_forkjoin_frames_mode == 2) {
628         this_thr->th.th_bar_min_time = KMP_MIN(
629             this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
630       }
631 #endif
632       if (reduce) {
633         KA_TRACE(100,
634                  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
635                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
636                   team->t.t_id, i));
637         OMPT_REDUCTION_DECL(this_thr, gtid);
638         OMPT_REDUCTION_BEGIN;
639         (*reduce)(this_thr->th.th_local.reduce_data,
640                   other_threads[i]->th.th_local.reduce_data);
641         OMPT_REDUCTION_END;
642       }
643     }
644     // Don't have to worry about sleep bit here or atomic since team setting
645     team_bar->b_arrived = new_state;
646     KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
647                   "arrived(%p) = %llu\n",
648                   gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
649                   new_state));
650   }
651   KA_TRACE(
652       20,
653       ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
654        gtid, team->t.t_id, tid, bt));
655   return false;
656 }
657 
658 template <bool cancellable = false>
659 static bool __kmp_linear_barrier_release_template(
660     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
661     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
662   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
663   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
664   kmp_team_t *team;
665 
666   if (KMP_MASTER_TID(tid)) {
667     unsigned int i;
668     kmp_uint32 nproc = this_thr->th.th_team_nproc;
669     kmp_info_t **other_threads;
670 
671     team = __kmp_threads[gtid]->th.th_team;
672     KMP_DEBUG_ASSERT(team != NULL);
673     other_threads = team->t.t_threads;
674 
675     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
676                   "barrier type %d\n",
677                   gtid, team->t.t_id, tid, bt));
678 
679     if (nproc > 1) {
680 #if KMP_BARRIER_ICV_PUSH
681       {
682         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
683         if (propagate_icvs) {
684           ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
685           for (i = 1; i < nproc; ++i) {
686             __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
687                                      team, i, FALSE);
688             ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
689                            &team->t.t_implicit_task_taskdata[0].td_icvs);
690           }
691           ngo_sync();
692         }
693       }
694 #endif // KMP_BARRIER_ICV_PUSH
695 
696       // Now, release all of the worker threads
697       for (i = 1; i < nproc; ++i) {
698 #if KMP_CACHE_MANAGE
699         // Prefetch next thread's go flag
700         if (i + 1 < nproc)
701           KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
702 #endif /* KMP_CACHE_MANAGE */
703         KA_TRACE(
704             20,
705             ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
706              "go(%p): %u => %u\n",
707              gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
708              team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
709              other_threads[i]->th.th_bar[bt].bb.b_go,
710              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
711         kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
712                            other_threads[i]);
713         flag.release();
714       }
715     }
716   } else { // Wait for the PRIMARY thread to release us
717     KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
718                   gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
719     if (cancellable) {
720       kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
721       if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
722         return true;
723     } else {
724       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
725       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
726     }
727 #if USE_ITT_BUILD && USE_ITT_NOTIFY
728     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
729       // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
730       // disabled)
731       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
732       // Cancel wait on previous parallel region...
733       __kmp_itt_task_starting(itt_sync_obj);
734 
735       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
736         return false;
737 
738       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
739       if (itt_sync_obj != NULL)
740         // Call prepare as early as possible for "new" barrier
741         __kmp_itt_task_finished(itt_sync_obj);
742     } else
743 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
744         // Early exit for reaping threads releasing forkjoin barrier
745         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
746       return false;
747 // The worker thread may now assume that the team is valid.
748 #ifdef KMP_DEBUG
749     tid = __kmp_tid_from_gtid(gtid);
750     team = __kmp_threads[gtid]->th.th_team;
751 #endif
752     KMP_DEBUG_ASSERT(team != NULL);
753     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
754     KA_TRACE(20,
755              ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
756               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
757     KMP_MB(); // Flush all pending memory write invalidates.
758   }
759   KA_TRACE(
760       20,
761       ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
762        gtid, team->t.t_id, tid, bt));
763   return false;
764 }
765 
766 static void __kmp_linear_barrier_gather(
767     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
768     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
769   __kmp_linear_barrier_gather_template<false>(
770       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
771 }
772 
773 static bool __kmp_linear_barrier_gather_cancellable(
774     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
775     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
776   return __kmp_linear_barrier_gather_template<true>(
777       bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
778 }
779 
780 static void __kmp_linear_barrier_release(
781     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
782     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
783   __kmp_linear_barrier_release_template<false>(
784       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
785 }
786 
787 static bool __kmp_linear_barrier_release_cancellable(
788     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
789     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
790   return __kmp_linear_barrier_release_template<true>(
791       bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
792 }
793 
794 // Tree barrier
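// In the tree barrier, thread tid treats threads (tid << branch_bits) + 1
// through (tid << branch_bits) + branch_factor as its children: on gather,
// parents wait for each child's b_arrived (reducing as they go) and then
// report to their own parent at (tid - 1) >> branch_bits; on release the
// wave runs top-down, each parent bumping its children's b_go flags and
// pushing ICVs when requested.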
795 static void __kmp_tree_barrier_gather(
796     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
797     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
798   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
799   kmp_team_t *team = this_thr->th.th_team;
800   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
801   kmp_info_t **other_threads = team->t.t_threads;
802   kmp_uint32 nproc = this_thr->th.th_team_nproc;
803   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
804   kmp_uint32 branch_factor = 1 << branch_bits;
805   kmp_uint32 child;
806   kmp_uint32 child_tid;
807   kmp_uint64 new_state = 0;
808 
809   KA_TRACE(
810       20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
811            gtid, team->t.t_id, tid, bt));
812   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
813 
814 #if USE_ITT_BUILD && USE_ITT_NOTIFY
815   // Barrier imbalance - save arrive time to the thread
816   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
817     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
818         __itt_get_timestamp();
819   }
820 #endif
821   // Perform tree gather to wait until all threads have arrived; reduce any
822   // required data as we go
823   child_tid = (tid << branch_bits) + 1;
824   if (child_tid < nproc) {
825     // Parent threads wait for all their children to arrive
826     new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
827     child = 1;
828     do {
829       kmp_info_t *child_thr = other_threads[child_tid];
830       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
831 #if KMP_CACHE_MANAGE
832       // Prefetch next thread's arrived count
833       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
834         KMP_CACHE_PREFETCH(
835             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
836 #endif /* KMP_CACHE_MANAGE */
837       KA_TRACE(20,
838                ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
839                 "arrived(%p) == %llu\n",
840                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
841                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
842       // Wait for child to arrive
843       kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
844       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
845 #if USE_ITT_BUILD && USE_ITT_NOTIFY
846       // Barrier imbalance - write min of the thread time and a child time to
847       // the thread.
848       if (__kmp_forkjoin_frames_mode == 2) {
849         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
850                                                child_thr->th.th_bar_min_time);
851       }
852 #endif
853       if (reduce) {
854         KA_TRACE(100,
855                  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
856                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
857                   team->t.t_id, child_tid));
858         OMPT_REDUCTION_DECL(this_thr, gtid);
859         OMPT_REDUCTION_BEGIN;
860         (*reduce)(this_thr->th.th_local.reduce_data,
861                   child_thr->th.th_local.reduce_data);
862         OMPT_REDUCTION_END;
863       }
864       child++;
865       child_tid++;
866     } while (child <= branch_factor && child_tid < nproc);
867   }
868 
869   if (!KMP_MASTER_TID(tid)) { // Worker threads
870     kmp_int32 parent_tid = (tid - 1) >> branch_bits;
871 
872     KA_TRACE(20,
873              ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
874               "arrived(%p): %llu => %llu\n",
875               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
876               team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
877               thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
878 
879     // Mark arrival to parent thread
880     /* After performing this write, a worker thread may not assume that the team
881        is valid any more - it could be deallocated by the primary thread at any
882        time.  */
883     kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
884     flag.release();
885   } else {
886     // Need to update the team arrived pointer if we are the primary thread
887     if (nproc > 1) // New value was already computed above
888       team->t.t_bar[bt].b_arrived = new_state;
889     else
890       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
891     KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
892                   "arrived(%p) = %llu\n",
893                   gtid, team->t.t_id, tid, team->t.t_id,
894                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
895   }
896   KA_TRACE(20,
897            ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
898             gtid, team->t.t_id, tid, bt));
899 }
900 
901 static void __kmp_tree_barrier_release(
902     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
903     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
904   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
905   kmp_team_t *team;
906   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
907   kmp_uint32 nproc;
908   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
909   kmp_uint32 branch_factor = 1 << branch_bits;
910   kmp_uint32 child;
911   kmp_uint32 child_tid;
912 
913   // Perform a tree release for all of the threads that have been gathered
914   if (!KMP_MASTER_TID(
915           tid)) { // Handle fork barrier workers who aren't part of a team yet
916     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
917                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
918     // Wait for parent thread to release us
919     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
920     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
921 #if USE_ITT_BUILD && USE_ITT_NOTIFY
922     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
923       // In fork barrier where we could not get the object reliably (or
924       // ITTNOTIFY is disabled)
925       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
926       // Cancel wait on previous parallel region...
927       __kmp_itt_task_starting(itt_sync_obj);
928 
929       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
930         return;
931 
932       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
933       if (itt_sync_obj != NULL)
934         // Call prepare as early as possible for "new" barrier
935         __kmp_itt_task_finished(itt_sync_obj);
936     } else
937 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
938         // Early exit for reaping threads releasing forkjoin barrier
939         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
940       return;
941 
942     // The worker thread may now assume that the team is valid.
943     team = __kmp_threads[gtid]->th.th_team;
944     KMP_DEBUG_ASSERT(team != NULL);
945     tid = __kmp_tid_from_gtid(gtid);
946 
947     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
948     KA_TRACE(20,
949              ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
950               team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
951     KMP_MB(); // Flush all pending memory write invalidates.
952   } else {
953     team = __kmp_threads[gtid]->th.th_team;
954     KMP_DEBUG_ASSERT(team != NULL);
955     KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
956                   "barrier type %d\n",
957                   gtid, team->t.t_id, tid, bt));
958   }
959   nproc = this_thr->th.th_team_nproc;
960   child_tid = (tid << branch_bits) + 1;
961 
962   if (child_tid < nproc) {
963     kmp_info_t **other_threads = team->t.t_threads;
964     child = 1;
965     // Parent threads release all their children
966     do {
967       kmp_info_t *child_thr = other_threads[child_tid];
968       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
969 #if KMP_CACHE_MANAGE
970       // Prefetch next thread's go count
971       if (child + 1 <= branch_factor && child_tid + 1 < nproc)
972         KMP_CACHE_PREFETCH(
973             &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
974 #endif /* KMP_CACHE_MANAGE */
975 
976 #if KMP_BARRIER_ICV_PUSH
977       {
978         KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
979         if (propagate_icvs) {
980           __kmp_init_implicit_task(team->t.t_ident,
981                                    team->t.t_threads[child_tid], team,
982                                    child_tid, FALSE);
983           copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
984                     &team->t.t_implicit_task_taskdata[0].td_icvs);
985         }
986       }
987 #endif // KMP_BARRIER_ICV_PUSH
988       KA_TRACE(20,
989                ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
990                 " go(%p): %u => %u\n",
991                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
992                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
993                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
994       // Release child from barrier
995       kmp_flag_64<> flag(&child_bar->b_go, child_thr);
996       flag.release();
997       child++;
998       child_tid++;
999     } while (child <= branch_factor && child_tid < nproc);
1000   }
1001   KA_TRACE(
1002       20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1003            gtid, team->t.t_id, tid, bt));
1004 }
1005 
1006 // Hyper Barrier
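// In the hypercube-embedded tree (hyper) barrier the pairing is derived from
// the bits of tid: at a given level, a thread whose bits
// (tid >> level) & (branch_factor - 1) are non-zero signals the parent
// obtained by clearing those bits and stops; otherwise it gathers from the
// children at tid + (1 << level), tid + 2 * (1 << level), ... and moves up
// to the next level. The release walks the same pairs, by default in
// reverse order (KMP_REVERSE_HYPER_BAR).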
1007 static void __kmp_hyper_barrier_gather(
1008     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1009     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1010   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1011   kmp_team_t *team = this_thr->th.th_team;
1012   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1013   kmp_info_t **other_threads = team->t.t_threads;
1014   kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1015   kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1016   kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1017   kmp_uint32 branch_factor = 1 << branch_bits;
1018   kmp_uint32 offset;
1019   kmp_uint32 level;
1020 
1021   KA_TRACE(
1022       20,
1023       ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1024        gtid, team->t.t_id, tid, bt));
1025   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1026 
1027 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1028   // Barrier imbalance - save arrive time to the thread
1029   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1030     this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1031         __itt_get_timestamp();
1032   }
1033 #endif
1034   /* Perform a hypercube-embedded tree gather to wait until all of the threads
1035      have arrived, and reduce any required data as we go.  */
1036   kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1037   for (level = 0, offset = 1; offset < num_threads;
1038        level += branch_bits, offset <<= branch_bits) {
1039     kmp_uint32 child;
1040     kmp_uint32 child_tid;
1041 
1042     if (((tid >> level) & (branch_factor - 1)) != 0) {
1043       kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1044 
1045       KMP_MB(); // Synchronize parent and child threads.
1046       KA_TRACE(20,
1047                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1048                 "arrived(%p): %llu => %llu\n",
1049                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1050                 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1051                 thr_bar->b_arrived,
1052                 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1053       // Mark arrival to parent thread
1054       /* After performing this write (in the last iteration of the enclosing for
1055          loop), a worker thread may not assume that the team is valid any more
1056          - it could be deallocated by the primary thread at any time.  */
1057       p_flag.set_waiter(other_threads[parent_tid]);
1058       p_flag.release();
1059       break;
1060     }
1061 
1062     // Parent threads wait for children to arrive
1063     if (new_state == KMP_BARRIER_UNUSED_STATE)
1064       new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1065     for (child = 1, child_tid = tid + (1 << level);
1066          child < branch_factor && child_tid < num_threads;
1067          child++, child_tid += (1 << level)) {
1068       kmp_info_t *child_thr = other_threads[child_tid];
1069       kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1070 #if KMP_CACHE_MANAGE
1071       kmp_uint32 next_child_tid = child_tid + (1 << level);
1072       // Prefetch next thread's arrived count
1073       if (child + 1 < branch_factor && next_child_tid < num_threads)
1074         KMP_CACHE_PREFETCH(
1075             &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1076 #endif /* KMP_CACHE_MANAGE */
1077       KA_TRACE(20,
1078                ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1079                 "arrived(%p) == %llu\n",
1080                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1081                 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1082       // Wait for child to arrive
1083       kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1084       c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1085       KMP_MB(); // Synchronize parent and child threads.
1086 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1087       // Barrier imbalance - write min of the thread time and a child time to
1088       // the thread.
1089       if (__kmp_forkjoin_frames_mode == 2) {
1090         this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1091                                                child_thr->th.th_bar_min_time);
1092       }
1093 #endif
1094       if (reduce) {
1095         KA_TRACE(100,
1096                  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1097                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1098                   team->t.t_id, child_tid));
1099         OMPT_REDUCTION_DECL(this_thr, gtid);
1100         OMPT_REDUCTION_BEGIN;
1101         (*reduce)(this_thr->th.th_local.reduce_data,
1102                   child_thr->th.th_local.reduce_data);
1103         OMPT_REDUCTION_END;
1104       }
1105     }
1106   }
1107 
1108   if (KMP_MASTER_TID(tid)) {
1109     // Need to update the team arrived pointer if we are the primary thread
1110     if (new_state == KMP_BARRIER_UNUSED_STATE)
1111       team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1112     else
1113       team->t.t_bar[bt].b_arrived = new_state;
1114     KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1115                   "arrived(%p) = %llu\n",
1116                   gtid, team->t.t_id, tid, team->t.t_id,
1117                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1118   }
1119   KA_TRACE(
1120       20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1121            gtid, team->t.t_id, tid, bt));
1122 }
1123 
1124 // The reverse versions seem to beat the forward versions overall
1125 #define KMP_REVERSE_HYPER_BAR
1126 static void __kmp_hyper_barrier_release(
1127     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1128     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1129   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1130   kmp_team_t *team;
1131   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1132   kmp_info_t **other_threads;
1133   kmp_uint32 num_threads;
1134   kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1135   kmp_uint32 branch_factor = 1 << branch_bits;
1136   kmp_uint32 child;
1137   kmp_uint32 child_tid;
1138   kmp_uint32 offset;
1139   kmp_uint32 level;
1140 
1141   /* Perform a hypercube-embedded tree release for all of the threads that have
1142      been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1143      are released in the reverse order of the corresponding gather, otherwise
1144      threads are released in the same order. */
1145   if (KMP_MASTER_TID(tid)) { // primary thread
1146     team = __kmp_threads[gtid]->th.th_team;
1147     KMP_DEBUG_ASSERT(team != NULL);
1148     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1149                   "barrier type %d\n",
1150                   gtid, team->t.t_id, tid, bt));
1151 #if KMP_BARRIER_ICV_PUSH
1152     if (propagate_icvs) { // primary already has ICVs in final destination; copy
1153       copy_icvs(&thr_bar->th_fixed_icvs,
1154                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1155     }
1156 #endif
1157   } else { // Handle fork barrier workers who aren't part of a team yet
1158     KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1159                   &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1160     // Wait for parent thread to release us
1161     kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1162     flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1163 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1164     if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1165       // In fork barrier where we could not get the object reliably
1166       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1167       // Cancel wait on previous parallel region...
1168       __kmp_itt_task_starting(itt_sync_obj);
1169 
1170       if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1171         return;
1172 
1173       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1174       if (itt_sync_obj != NULL)
1175         // Call prepare as early as possible for "new" barrier
1176         __kmp_itt_task_finished(itt_sync_obj);
1177     } else
1178 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1179         // Early exit for reaping threads releasing forkjoin barrier
1180         if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1181       return;
1182 
1183     // The worker thread may now assume that the team is valid.
1184     team = __kmp_threads[gtid]->th.th_team;
1185     KMP_DEBUG_ASSERT(team != NULL);
1186     tid = __kmp_tid_from_gtid(gtid);
1187 
1188     TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1189     KA_TRACE(20,
1190              ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1191               gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1192     KMP_MB(); // Flush all pending memory write invalidates.
1193   }
1194   num_threads = this_thr->th.th_team_nproc;
1195   other_threads = team->t.t_threads;
1196 
1197 #ifdef KMP_REVERSE_HYPER_BAR
1198   // Count up to correct level for parent
1199   for (level = 0, offset = 1;
1200        offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1201        level += branch_bits, offset <<= branch_bits)
1202     ;
1203 
1204   // Now go down from there
1205   for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1206        level -= branch_bits, offset >>= branch_bits)
1207 #else
1208   // Go down the tree, level by level
1209   for (level = 0, offset = 1; offset < num_threads;
1210        level += branch_bits, offset <<= branch_bits)
1211 #endif // KMP_REVERSE_HYPER_BAR
1212   {
1213 #ifdef KMP_REVERSE_HYPER_BAR
1214     /* Now go in reverse order through the children, highest to lowest.
1215        Initial setting of child is conservative here. */
1216     child = num_threads >> ((level == 0) ? level : level - 1);
1217     for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1218         child_tid = tid + (child << level);
1219          child >= 1; child--, child_tid -= (1 << level))
1220 #else
1221     if (((tid >> level) & (branch_factor - 1)) != 0)
1222       // No need to go lower than this, since this is the level parent would be
1223       // notified
1224       break;
1225     // Iterate through children on this level of the tree
1226     for (child = 1, child_tid = tid + (1 << level);
1227          child < branch_factor && child_tid < num_threads;
1228          child++, child_tid += (1 << level))
1229 #endif // KMP_REVERSE_HYPER_BAR
1230     {
1231       if (child_tid >= num_threads)
1232         continue; // Child doesn't exist so keep going
1233       else {
1234         kmp_info_t *child_thr = other_threads[child_tid];
1235         kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1236 #if KMP_CACHE_MANAGE
1237         kmp_uint32 next_child_tid = child_tid - (1 << level);
1238 // Prefetch next thread's go count
1239 #ifdef KMP_REVERSE_HYPER_BAR
1240         if (child - 1 >= 1 && next_child_tid < num_threads)
1241 #else
1242         if (child + 1 < branch_factor && next_child_tid < num_threads)
1243 #endif // KMP_REVERSE_HYPER_BAR
1244           KMP_CACHE_PREFETCH(
1245               &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1246 #endif /* KMP_CACHE_MANAGE */
1247 
1248 #if KMP_BARRIER_ICV_PUSH
1249         if (propagate_icvs) // push my fixed ICVs to my child
1250           copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1251 #endif // KMP_BARRIER_ICV_PUSH
1252 
1253         KA_TRACE(
1254             20,
1255             ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1256              " go(%p): %u => %u\n",
1257              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1258              team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1259              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1260         // Release child from barrier
1261         kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1262         flag.release();
1263       }
1264     }
1265   }
1266 #if KMP_BARRIER_ICV_PUSH
1267   if (propagate_icvs &&
1268       !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1269     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1270                              FALSE);
1271     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1272               &thr_bar->th_fixed_icvs);
1273   }
1274 #endif
1275   KA_TRACE(
1276       20,
1277       ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1278        gtid, team->t.t_id, tid, bt));
1279 }
1280 
1281 // Hierarchical Barrier
1282 
1283 // Initialize thread barrier data
1284 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1285    Performs the minimum amount of initialization required based on how the team
1286    has changed. Returns true if leaf children will require both on-core and
1287    traditional wake-up mechanisms. For example, if the team size increases,
1288    threads already in the team will respond to on-core wakeup on their parent
1289    thread, but threads newly added to the team will only be listening on
1290    their local b_go. */
1291 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1292                                                    kmp_bstate_t *thr_bar,
1293                                                    kmp_uint32 nproc, int gtid,
1294                                                    int tid, kmp_team_t *team) {
1295   // Checks to determine if (re-)initialization is needed
1296   bool uninitialized = thr_bar->team == NULL;
1297   bool team_changed = team != thr_bar->team;
1298   bool team_sz_changed = nproc != thr_bar->nproc;
1299   bool tid_changed = tid != thr_bar->old_tid;
1300   bool retval = false;
1301 
1302   if (uninitialized || team_sz_changed) {
1303     __kmp_get_hierarchy(nproc, thr_bar);
1304   }
1305 
1306   if (uninitialized || team_sz_changed || tid_changed) {
1307     thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1308     thr_bar->parent_tid = -1; // default for primary thread
1309     if (!KMP_MASTER_TID(tid)) {
1310       // if not primary thread, find parent thread in hierarchy
1311       kmp_uint32 d = 0;
1312       while (d < thr_bar->depth) { // find parent based on level of thread in
1313         // hierarchy, and note level
1314         kmp_uint32 rem;
1315         if (d == thr_bar->depth - 2) { // reached level right below the primary
1316           thr_bar->parent_tid = 0;
1317           thr_bar->my_level = d;
1318           break;
1319         } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1320           // TODO: can we make the above op faster?
1321           // thread is not a subtree root at next level; this is its max level
1322           thr_bar->parent_tid = tid - rem;
1323           thr_bar->my_level = d;
1324           break;
1325         }
1326         ++d;
1327       }
1328     }
1329     __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1330                             (thr_bar->skip_per_level[thr_bar->my_level])),
1331                        &(thr_bar->offset));
1332     thr_bar->old_tid = tid;
1333     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1334     thr_bar->team = team;
1335     thr_bar->parent_bar =
1336         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1337   }
1338   if (uninitialized || team_changed || tid_changed) {
1339     thr_bar->team = team;
1340     thr_bar->parent_bar =
1341         &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1342     retval = true;
1343   }
1344   if (uninitialized || team_sz_changed || tid_changed) {
1345     thr_bar->nproc = nproc;
1346     thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1347     if (thr_bar->my_level == 0)
1348       thr_bar->leaf_kids = 0;
1349     if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1350       __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1351     thr_bar->leaf_state = 0;
1352     for (int i = 0; i < thr_bar->leaf_kids; ++i)
1353       ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1354   }
1355   return retval;
1356 }
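
/* Illustrative sketch (not part of the runtime; guarded out of compilation):
   a worked example of the parent/offset arithmetic above, assuming a
   hypothetical three-level hierarchy with skip_per_level = {1, 4, 16},
   depth = 3 and nproc = 16. The real hierarchy comes from
   __kmp_get_hierarchy() and may look different on an actual machine. */
#if 0
static void example_hier_init_math() {
  const int skip_per_level[3] = {1, 4, 16}; // assumed example hierarchy
  int tid = 5;
  // tid 5 is not a multiple of skip_per_level[1] (== 4), so it is a leaf:
  int rem = tid % skip_per_level[1]; // 1
  int parent_tid = tid - rem; // 4
  int my_level = 0;
  // A leaf signals byte (offset + 1) of its parent's 64-bit flag:
  int offset = 7 - ((tid - parent_tid) / skip_per_level[my_level]); // 6
  // Thus tids 5, 6, 7 map to bytes 7, 6, 5 of tid 4's flag -- the same bytes
  // tid 4 marks in its leaf_state above, assuming base_leaf_kids == 3.
  (void)offset;
}
#endif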
1357 
1358 static void __kmp_hierarchical_barrier_gather(
1359     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1360     void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1361   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1362   kmp_team_t *team = this_thr->th.th_team;
1363   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1364   kmp_uint32 nproc = this_thr->th.th_team_nproc;
1365   kmp_info_t **other_threads = team->t.t_threads;
1366   kmp_uint64 new_state = 0;
1367 
1368   int level = team->t.t_level;
1369   if (other_threads[0]
1370           ->th.th_teams_microtask) // are we inside the teams construct?
1371     if (this_thr->th.th_teams_size.nteams > 1)
1372       ++level; // level was not increased in teams construct for team_of_masters
1373   if (level == 1)
1374     thr_bar->use_oncore_barrier = 1;
1375   else
1376     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1377 
1378   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1379                 "barrier type %d\n",
1380                 gtid, team->t.t_id, tid, bt));
1381   KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1382 
1383 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1384   // Barrier imbalance - save arrive time to the thread
1385   if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1386     this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1387   }
1388 #endif
1389 
1390   (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1391                                                team);
1392 
1393   if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1394     kmp_int32 child_tid;
1395     new_state =
1396         (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1397     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1398         thr_bar->use_oncore_barrier) {
1399       if (thr_bar->leaf_kids) {
1400         // First, wait for leaf children to check-in on my b_arrived flag
1401         kmp_uint64 leaf_state =
1402             KMP_MASTER_TID(tid)
1403                 ? thr_bar->b_arrived | thr_bar->leaf_state
1404                 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1405         KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1406                       "for leaf kids\n",
1407                       gtid, team->t.t_id, tid));
1408         kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1409         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1410         if (reduce) {
1411           OMPT_REDUCTION_DECL(this_thr, gtid);
1412           OMPT_REDUCTION_BEGIN;
1413           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1414                ++child_tid) {
1415             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1416                            "T#%d(%d:%d)\n",
1417                            gtid, team->t.t_id, tid,
1418                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1419                            child_tid));
1420             (*reduce)(this_thr->th.th_local.reduce_data,
1421                       other_threads[child_tid]->th.th_local.reduce_data);
1422           }
1423           OMPT_REDUCTION_END;
1424         }
1425         // clear leaf_state bits
1426         KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1427       }
1428       // Next, wait for higher level children on each child's b_arrived flag
1429       for (kmp_uint32 d = 1; d < thr_bar->my_level;
1430            ++d) { // gather lowest level threads first, but skip 0
1431         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1432                    skip = thr_bar->skip_per_level[d];
1433         if (last > nproc)
1434           last = nproc;
1435         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1436           kmp_info_t *child_thr = other_threads[child_tid];
1437           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1438           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1439                         "T#%d(%d:%d) "
1440                         "arrived(%p) == %llu\n",
1441                         gtid, team->t.t_id, tid,
1442                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1443                         child_tid, &child_bar->b_arrived, new_state));
1444           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1445           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1446           if (reduce) {
1447             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1448                            "T#%d(%d:%d)\n",
1449                            gtid, team->t.t_id, tid,
1450                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1451                            child_tid));
1452             (*reduce)(this_thr->th.th_local.reduce_data,
1453                       child_thr->th.th_local.reduce_data);
1454           }
1455         }
1456       }
1457     } else { // Blocktime is not infinite
1458       for (kmp_uint32 d = 0; d < thr_bar->my_level;
1459            ++d) { // Gather lowest level threads first
1460         kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1461                    skip = thr_bar->skip_per_level[d];
1462         if (last > nproc)
1463           last = nproc;
1464         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1465           kmp_info_t *child_thr = other_threads[child_tid];
1466           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1467           KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1468                         "T#%d(%d:%d) "
1469                         "arrived(%p) == %llu\n",
1470                         gtid, team->t.t_id, tid,
1471                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1472                         child_tid, &child_bar->b_arrived, new_state));
1473           kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1474           flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1475           if (reduce) {
1476             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1477                            "T#%d(%d:%d)\n",
1478                            gtid, team->t.t_id, tid,
1479                            __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1480                            child_tid));
1481             (*reduce)(this_thr->th.th_local.reduce_data,
1482                       child_thr->th.th_local.reduce_data);
1483           }
1484         }
1485       }
1486     }
1487   }
1488   // All subordinates are gathered; now release parent if not primary thread
1489 
1490   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1491     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1492                   " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1493                   gtid, team->t.t_id, tid,
1494                   __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1495                   thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1496                   thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1497     /* Mark arrival to parent: After performing this write, a worker thread may
1498        not assume that the team is valid any more - it could be deallocated by
1499        the primary thread at any time. */
1500     if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1501         !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1502       // flag; release it
1503       kmp_flag_64<> flag(&thr_bar->b_arrived,
1504                          other_threads[thr_bar->parent_tid]);
1505       flag.release();
1506     } else {
1507       // Leaf does special release on "offset" bits of parent's b_arrived flag
1508       thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1509       kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1510                            thr_bar->offset + 1);
1511       flag.set_waiter(other_threads[thr_bar->parent_tid]);
1512       flag.release();
1513     }
1514   } else { // Primary thread needs to update the team's b_arrived value
1515     team->t.t_bar[bt].b_arrived = new_state;
1516     KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1517                   "arrived(%p) = %llu\n",
1518                   gtid, team->t.t_id, tid, team->t.t_id,
1519                   &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1520   }
1521   // Is the team access below unsafe or just technically invalid?
1522   KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1523                 "barrier type %d\n",
1524                 gtid, team->t.t_id, tid, bt));
1525 }
1526 
1527 static void __kmp_hierarchical_barrier_release(
1528     enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1529     int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1530   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1531   kmp_team_t *team;
1532   kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1533   kmp_uint32 nproc;
1534   bool team_change = false; // indicates on-core barrier shouldn't be used
1535 
1536   if (KMP_MASTER_TID(tid)) {
1537     team = __kmp_threads[gtid]->th.th_team;
1538     KMP_DEBUG_ASSERT(team != NULL);
1539     KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1540                   "entered barrier type %d\n",
1541                   gtid, team->t.t_id, tid, bt));
1542   } else { // Worker threads
1543     // Wait for parent thread to release me
1544     if (!thr_bar->use_oncore_barrier ||
1545         __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1546         thr_bar->team == NULL) {
1547       // Use traditional method of waiting on my own b_go flag
1548       thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1549       kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1550       flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1551       TCW_8(thr_bar->b_go,
1552             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1553     } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1554       // infinite, not nested
1555       // Wait on my "offset" bits on parent's b_go flag
1556       thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1557       kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1558                            thr_bar->offset + 1, bt,
1559                            this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1560       flag.wait(this_thr, TRUE);
1561       if (thr_bar->wait_flag ==
1562           KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1563         TCW_8(thr_bar->b_go,
1564               KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1565       } else { // Reset my bits on parent's b_go flag
1566         (RCAST(volatile char *,
1567                &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1568       }
1569     }
1570     thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1571     // Early exit for reaping threads releasing forkjoin barrier
1572     if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1573       return;
1574     // The worker thread may now assume that the team is valid.
1575     team = __kmp_threads[gtid]->th.th_team;
1576     KMP_DEBUG_ASSERT(team != NULL);
1577     tid = __kmp_tid_from_gtid(gtid);
1578 
1579     KA_TRACE(
1580         20,
1581         ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1582          gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1583     KMP_MB(); // Flush all pending memory write invalidates.
1584   }
1585 
1586   nproc = this_thr->th.th_team_nproc;
1587   int level = team->t.t_level;
1588   if (team->t.t_threads[0]
1589           ->th.th_teams_microtask) { // are we inside the teams construct?
1590     if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1591         this_thr->th.th_teams_level == level)
1592       ++level; // level was not increased in teams construct for team_of_workers
1593     if (this_thr->th.th_teams_size.nteams > 1)
1594       ++level; // level was not increased in teams construct for team_of_masters
1595   }
1596   if (level == 1)
1597     thr_bar->use_oncore_barrier = 1;
1598   else
1599     thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1600 
1601   // If the team size has increased, we still communicate with old leaves via
1602   // oncore barrier.
1603   unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1604   kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1605   team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1606                                                        tid, team);
1607   // But if the entire team changes, we won't use oncore barrier at all
1608   if (team_change)
1609     old_leaf_kids = 0;
1610 
1611 #if KMP_BARRIER_ICV_PUSH
1612   if (propagate_icvs) {
1613     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1614                              FALSE);
1615     if (KMP_MASTER_TID(
1616             tid)) { // primary already has copy in final destination; copy
1617       copy_icvs(&thr_bar->th_fixed_icvs,
1618                 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1619     } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1620                thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1621       if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1622         // leaves (on-core children) pull parent's fixed ICVs directly to local
1623         // ICV store
1624         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1625                   &thr_bar->parent_bar->th_fixed_icvs);
1626       // non-leaves will get ICVs piggybacked with b_go via NGO store
1627     } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1628       if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1629         // access
1630         copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1631       else // leaves copy parent's fixed ICVs directly to local ICV store
1632         copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1633                   &thr_bar->parent_bar->th_fixed_icvs);
1634     }
1635   }
1636 #endif // KMP_BARRIER_ICV_PUSH
1637 
1638   // Now, release my children
1639   if (thr_bar->my_level) { // not a leaf
1640     kmp_int32 child_tid;
1641     kmp_uint32 last;
1642     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1643         thr_bar->use_oncore_barrier) {
1644       if (KMP_MASTER_TID(tid)) { // do a flat release
1645         // Set local b_go to bump children via NGO store of the cache line
1646         // containing ICVs and b_go.
1647         thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1648         // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1649         // the cache line
1650         ngo_load(&thr_bar->th_fixed_icvs);
1651         // This loops over all the threads skipping only the leaf nodes in the
1652         // hierarchy
1653         for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1654              child_tid += thr_bar->skip_per_level[1]) {
1655           kmp_bstate_t *child_bar =
1656               &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1657           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1658                         "releasing T#%d(%d:%d)"
1659                         " go(%p): %u => %u\n",
1660                         gtid, team->t.t_id, tid,
1661                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1662                         child_tid, &child_bar->b_go, child_bar->b_go,
1663                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1664           // Use ngo store (if available) to both store ICVs and release child
1665           // via child's b_go
1666           ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1667         }
1668         ngo_sync();
1669       }
1670       TCW_8(thr_bar->b_go,
1671             KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1672       // Now, release leaf children
1673       if (thr_bar->leaf_kids) { // if there are any
1674         // We test team_change on the off-chance that the level 1 team changed.
1675         if (team_change ||
1676             old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1677           if (old_leaf_kids) { // release old leaf kids
1678             thr_bar->b_go |= old_leaf_state;
1679           }
1680           // Release new leaf kids
1681           last = tid + thr_bar->skip_per_level[1];
1682           if (last > nproc)
1683             last = nproc;
1684           for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1685                ++child_tid) { // skip_per_level[0]=1
1686             kmp_info_t *child_thr = team->t.t_threads[child_tid];
1687             kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1688             KA_TRACE(
1689                 20,
1690                 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1691                  " T#%d(%d:%d) go(%p): %u => %u\n",
1692                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1693                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1694                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1695             // Release child using child's b_go flag
1696             kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1697             flag.release();
1698           }
1699         } else { // Release all children at once with leaf_state bits on my own
1700           // b_go flag
1701           thr_bar->b_go |= thr_bar->leaf_state;
1702         }
1703       }
1704     } else { // Blocktime is not infinite; do a simple hierarchical release
1705       for (int d = thr_bar->my_level - 1; d >= 0;
1706            --d) { // Release highest level threads first
1707         last = tid + thr_bar->skip_per_level[d + 1];
1708         kmp_uint32 skip = thr_bar->skip_per_level[d];
1709         if (last > nproc)
1710           last = nproc;
1711         for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1712           kmp_info_t *child_thr = team->t.t_threads[child_tid];
1713           kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1714           KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1715                         "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1716                         gtid, team->t.t_id, tid,
1717                         __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1718                         child_tid, &child_bar->b_go, child_bar->b_go,
1719                         child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720           // Release child using child's b_go flag
1721           kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722           flag.release();
1723         }
1724       }
1725     }
1726 #if KMP_BARRIER_ICV_PUSH
1727     if (propagate_icvs && !KMP_MASTER_TID(tid))
1728       // non-leaves copy ICVs from fixed ICVs to local dest
1729       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1730                 &thr_bar->th_fixed_icvs);
1731 #endif // KMP_BARRIER_ICV_PUSH
1732   }
1733   KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1734                 "barrier type %d\n",
1735                 gtid, team->t.t_id, tid, bt));
1736 }
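
/* Illustrative sketch (not part of the runtime; guarded out of compilation):
   the infinite-blocktime release path above packs one "go" flag per leaf into
   individual bytes of the parent's 64-bit b_go word, so a single OR of
   leaf_state wakes every leaf at once. The snippet shows that byte-flag idea
   with local variables only; byte indices follow the offset + 1 convention of
   kmp_flag_oncore and the example hierarchy sketched earlier. */
#if 0
static void example_oncore_leaf_release() {
  kmp_uint64 parent_b_go = 0; // stand-in for the parent's b_go
  kmp_uint64 leaf_state = 0;
  for (int i = 0; i < 3; ++i) // three leaf kids -> bytes 7, 6, 5
    ((char *)&leaf_state)[7 - i] = 1;
  // Parent releases all of its leaves at once (cf. b_go |= leaf_state above):
  parent_b_go |= leaf_state;
  // A leaf with offset == 6 spins on byte offset + 1 == 7 of the parent word:
  while (((volatile char *)&parent_b_go)[6 + 1] == 0) {
  }
  // ...and clears its byte after being released (cf. the RCAST reset above):
  ((volatile char *)&parent_b_go)[6 + 1] = 0;
}
#endif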
1737 
1738 // End of Barrier Algorithms
1739 
1740 // type traits for cancellable value
1741 // if cancellable is true, then is_cancellable is a normal boolean variable
1742 // if cancellable is false, then is_cancellable is a compile time constant
1743 template <bool cancellable> struct is_cancellable {};
1744 template <> struct is_cancellable<true> {
1745   bool value;
1746   is_cancellable() : value(false) {}
1747   is_cancellable(bool b) : value(b) {}
1748   is_cancellable &operator=(bool b) {
1749     value = b;
1750     return *this;
1751   }
1752   operator bool() const { return value; }
1753 };
1754 template <> struct is_cancellable<false> {
1755   is_cancellable &operator=(bool b) { return *this; }
1756   constexpr operator bool() const { return false; }
1757 };
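
/* Illustrative sketch (not part of the runtime; guarded out of compilation):
   how the trait above lets one template body serve both the cancellable and
   non-cancellable paths. example_cancel_check() is a hypothetical helper; the
   point is that for cancellable == false the assignment is a no-op and
   `cancelled` converts to a constant false, so the branch folds away. */
#if 0
template <bool cancellable> static int example_cancel_check() {
  is_cancellable<cancellable> cancelled;
  cancelled = (__kmp_omp_cancellation != 0); // no-op when cancellable == false
  if (cancelled) // constant false in example_cancel_check<false>()
    return 1;
  return 0;
}
#endif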
1758 
1759 // Internal function to do a barrier.
1760 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1761    If reduce is non-NULL, perform the reduction during the gather phase;
1762    otherwise, no reduction is performed.
1763    When cancellable = false,
1764      Returns 0 if primary thread, 1 if worker thread.
1765    When cancellable = true,
1766      Returns 0 if not cancelled, 1 if cancelled.  */
1767 template <bool cancellable = false>
1768 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1769                                   size_t reduce_size, void *reduce_data,
1770                                   void (*reduce)(void *, void *)) {
1771   KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1772   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1773   int tid = __kmp_tid_from_gtid(gtid);
1774   kmp_info_t *this_thr = __kmp_threads[gtid];
1775   kmp_team_t *team = this_thr->th.th_team;
1776   int status = 0;
1777   is_cancellable<cancellable> cancelled;
1778 #if OMPT_SUPPORT && OMPT_OPTIONAL
1779   ompt_data_t *my_task_data;
1780   ompt_data_t *my_parallel_data;
1781   void *return_address;
1782   ompt_sync_region_t barrier_kind;
1783 #endif
1784 
1785   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1786                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1787 
1788 #if OMPT_SUPPORT
1789   if (ompt_enabled.enabled) {
1790 #if OMPT_OPTIONAL
1791     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1792     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1793     return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1794     barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1795     if (ompt_enabled.ompt_callback_sync_region) {
1796       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1797           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1798           return_address);
1799     }
1800     if (ompt_enabled.ompt_callback_sync_region_wait) {
1801       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1802           barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1803           return_address);
1804     }
1805 #endif
1806     // It is OK to report the barrier state after the barrier begin callback.
1807     // According to the OMPT specification, a compliant implementation may
1808     // even delay reporting this state until the barrier begins to wait.
1809     auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1810     switch (barrier_kind) {
1811     case ompt_sync_region_barrier_explicit:
1812       ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1813       break;
1814     case ompt_sync_region_barrier_implicit_workshare:
1815       ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1816       break;
1817     case ompt_sync_region_barrier_implicit_parallel:
1818       ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1819       break;
1820     case ompt_sync_region_barrier_teams:
1821       ompt_thr_info->state = ompt_state_wait_barrier_teams;
1822       break;
1823     case ompt_sync_region_barrier_implementation:
1824       [[fallthrough]];
1825     default:
1826       ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1827     }
1828   }
1829 #endif
1830 
1831   if (!team->t.t_serialized) {
1832 #if USE_ITT_BUILD
1833     // This value will be used in itt notify events below.
1834     void *itt_sync_obj = NULL;
1835 #if USE_ITT_NOTIFY
1836     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1837       itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1838 #endif
1839 #endif /* USE_ITT_BUILD */
1840     if (__kmp_tasking_mode == tskm_extra_barrier) {
1841       __kmp_tasking_barrier(team, this_thr, gtid);
1842       KA_TRACE(15,
1843                ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1844                 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1845     }
1846 
1847     /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1848        access it when the team struct is not guaranteed to exist. */
1849     // See note about the corresponding code in __kmp_join_barrier() being
1850     // performance-critical.
1851     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1852 #if KMP_USE_MONITOR
1853       this_thr->th.th_team_bt_intervals =
1854           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1855       this_thr->th.th_team_bt_set =
1856           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1857 #else
1858       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1859 #endif
1860     }
1861 
1862 #if USE_ITT_BUILD
1863     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1864       __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1865 #endif /* USE_ITT_BUILD */
1866 #if USE_DEBUGGER
1867     // Let the debugger know: the thread arrived at the barrier and is waiting.
1868     if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1869       team->t.t_bar[bt].b_master_arrived += 1;
1870     } else {
1871       this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1872     } // if
1873 #endif /* USE_DEBUGGER */
1874     if (reduce != NULL) {
1875       // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1876       this_thr->th.th_local.reduce_data = reduce_data;
1877     }
1878 
1879     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1880       __kmp_task_team_setup(this_thr, team);
1881 
1882     if (cancellable) {
1883       cancelled = __kmp_linear_barrier_gather_cancellable(
1884           bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1885     } else {
1886       switch (__kmp_barrier_gather_pattern[bt]) {
1887       case bp_dist_bar: {
1888         __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1889                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1890         break;
1891       }
1892       case bp_hyper_bar: {
1893         // don't set branch bits to 0; use linear
1894         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1895         __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1896                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1897         break;
1898       }
1899       case bp_hierarchical_bar: {
1900         __kmp_hierarchical_barrier_gather(
1901             bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1902         break;
1903       }
1904       case bp_tree_bar: {
1905         // don't set branch bits to 0; use linear
1906         KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1907         __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1908                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1909         break;
1910       }
1911       default: {
1912         __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1913                                     reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1914       }
1915       }
1916     }
1917 
1918     KMP_MB();
1919 
1920     if (KMP_MASTER_TID(tid)) {
1921       status = 0;
1922       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1923         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1924       }
1925 #if USE_DEBUGGER
1926       // Let the debugger know: all threads have arrived and are starting to
1927       // leave the barrier.
1928       team->t.t_bar[bt].b_team_arrived += 1;
1929 #endif
1930 
1931       if (__kmp_omp_cancellation) {
1932         kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1933         // Reset cancellation flag for worksharing constructs
1934         if (cancel_request == cancel_loop ||
1935             cancel_request == cancel_sections) {
1936           KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1937         }
1938       }
1939 #if USE_ITT_BUILD
1940       /* TODO: In case of split reduction barrier, primary thread may send
1941          acquired event early, before the final summation into the shared
1942          variable is done (final summation can be a long operation for array
1943          reductions).  */
1944       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1945         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1946 #endif /* USE_ITT_BUILD */
1947 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1948       // Barrier - report frame end (only if active_level == 1)
1949       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1950           __kmp_forkjoin_frames_mode &&
1951           (this_thr->th.th_teams_microtask == NULL || // either not in teams
1952            this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1953           team->t.t_active_level == 1) {
1954         ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1955         kmp_uint64 cur_time = __itt_get_timestamp();
1956         kmp_info_t **other_threads = team->t.t_threads;
1957         int nproc = this_thr->th.th_team_nproc;
1958         int i;
1959         switch (__kmp_forkjoin_frames_mode) {
1960         case 1:
1961           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1962                                  loc, nproc);
1963           this_thr->th.th_frame_time = cur_time;
1964           break;
1965         case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1966           // be fixed)
1967           __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1968                                  1, loc, nproc);
1969           break;
1970         case 3:
1971           if (__itt_metadata_add_ptr) {
1972             // Initialize with primary thread's wait time
1973             kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1974             // Set arrive time to zero to be able to check it in
1975             // __kmp_invoke_task(); the same is done inside the loop below
1976             this_thr->th.th_bar_arrive_time = 0;
1977             for (i = 1; i < nproc; ++i) {
1978               delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1979               other_threads[i]->th.th_bar_arrive_time = 0;
1980             }
1981             __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1982                                          cur_time, delta,
1983                                          (kmp_uint64)(reduce != NULL));
1984           }
1985           __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1986                                  loc, nproc);
1987           this_thr->th.th_frame_time = cur_time;
1988           break;
1989         }
1990       }
1991 #endif /* USE_ITT_BUILD */
1992     } else {
1993       status = 1;
1994 #if USE_ITT_BUILD
1995       if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1996         __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1997 #endif /* USE_ITT_BUILD */
1998     }
1999     if ((status == 1 || !is_split) && !cancelled) {
2000       if (cancellable) {
2001         cancelled = __kmp_linear_barrier_release_cancellable(
2002             bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2003       } else {
2004         switch (__kmp_barrier_release_pattern[bt]) {
2005         case bp_dist_bar: {
2006           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2007           __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2008                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2009           break;
2010         }
2011         case bp_hyper_bar: {
2012           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2013           __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2014                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2015           break;
2016         }
2017         case bp_hierarchical_bar: {
2018           __kmp_hierarchical_barrier_release(
2019               bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2020           break;
2021         }
2022         case bp_tree_bar: {
2023           KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2024           __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2025                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2026           break;
2027         }
2028         default: {
2029           __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2030                                        FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2031         }
2032         }
2033       }
2034       if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2035         __kmp_task_team_sync(this_thr, team);
2036       }
2037     }
2038 
2039 #if USE_ITT_BUILD
2040     /* GEH: TODO: Move this under if-condition above and also include in
2041        __kmp_end_split_barrier(). This will more accurately represent the actual
2042        release time of the threads for split barriers.  */
2043     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2044       __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2045 #endif /* USE_ITT_BUILD */
2046   } else { // Team is serialized.
2047     status = 0;
2048     if (__kmp_tasking_mode != tskm_immediate_exec) {
2049       if (this_thr->th.th_task_team != NULL) {
2050 #if USE_ITT_NOTIFY
2051         void *itt_sync_obj = NULL;
2052         if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2053           itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2054           __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2055         }
2056 #endif
2057 
2058         KMP_DEBUG_ASSERT(
2059             this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2060             this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2061                 TRUE);
2062         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2063         __kmp_task_team_setup(this_thr, team);
2064 
2065 #if USE_ITT_BUILD
2066         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2067           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2068 #endif /* USE_ITT_BUILD */
2069       }
2070     }
2071   }
2072   KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2073                 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2074                 __kmp_tid_from_gtid(gtid), status));
2075 
2076 #if OMPT_SUPPORT
2077   if (ompt_enabled.enabled) {
2078 #if OMPT_OPTIONAL
2079     if (ompt_enabled.ompt_callback_sync_region_wait) {
2080       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2081           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2082           return_address);
2083     }
2084     if (ompt_enabled.ompt_callback_sync_region) {
2085       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2086           barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2087           return_address);
2088     }
2089 #endif
2090     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2091   }
2092 #endif
2093 
2094   if (cancellable)
2095     return (int)cancelled;
2096   return status;
2097 }
2098 
2099 // Returns 0 if primary thread, 1 if worker thread.
2100 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2101                   size_t reduce_size, void *reduce_data,
2102                   void (*reduce)(void *, void *)) {
2103   return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2104                                   reduce);
2105 }
2106 
2107 #if defined(KMP_GOMP_COMPAT)
2108 // Returns 1 if cancelled, 0 otherwise
2109 int __kmp_barrier_gomp_cancel(int gtid) {
2110   if (__kmp_omp_cancellation) {
2111     int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2112                                                  0, NULL, NULL);
2113     if (cancelled) {
2114       int tid = __kmp_tid_from_gtid(gtid);
2115       kmp_info_t *this_thr = __kmp_threads[gtid];
2116       if (KMP_MASTER_TID(tid)) {
2117         // Primary thread does not need to revert anything
2118       } else {
2119         // Workers need to revert their private b_arrived flag
2120         this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2121             KMP_BARRIER_STATE_BUMP;
2122       }
2123     }
2124     return cancelled;
2125   }
2126   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2127   return FALSE;
2128 }
2129 #endif
2130 
2131 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2132   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2133   KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2134   KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2135   int tid = __kmp_tid_from_gtid(gtid);
2136   kmp_info_t *this_thr = __kmp_threads[gtid];
2137   kmp_team_t *team = this_thr->th.th_team;
2138 
2139   if (!team->t.t_serialized) {
2140     if (KMP_MASTER_GTID(gtid)) {
2141       switch (__kmp_barrier_release_pattern[bt]) {
2142       case bp_dist_bar: {
2143         __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2144                                    FALSE USE_ITT_BUILD_ARG(NULL));
2145         break;
2146       }
2147       case bp_hyper_bar: {
2148         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2149         __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2150                                     FALSE USE_ITT_BUILD_ARG(NULL));
2151         break;
2152       }
2153       case bp_hierarchical_bar: {
2154         __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2155                                            FALSE USE_ITT_BUILD_ARG(NULL));
2156         break;
2157       }
2158       case bp_tree_bar: {
2159         KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2160         __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2161                                    FALSE USE_ITT_BUILD_ARG(NULL));
2162         break;
2163       }
2164       default: {
2165         __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2166                                      FALSE USE_ITT_BUILD_ARG(NULL));
2167       }
2168       }
2169       if (__kmp_tasking_mode != tskm_immediate_exec) {
2170         __kmp_task_team_sync(this_thr, team);
2171       } // if
2172     }
2173   }
2174 }
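
/* Illustrative sketch (not part of the runtime; guarded out of compilation):
   the split-barrier pattern that __kmp_barrier() and __kmp_end_split_barrier()
   combine to implement, e.g. for reductions. With is_split = TRUE the primary
   thread returns without releasing the team, finishes whatever work required
   all threads to have gathered (such as the final reduction), and then calls
   __kmp_end_split_barrier() to let the workers go; workers return from
   __kmp_barrier() only after that release. The example_* names below are
   hypothetical and exist only for this sketch. */
#if 0
static void example_split_barrier_usage(int gtid, void *example_reduce_data,
                                        void (*example_reduce_fn)(void *,
                                                                  void *)) {
  if (__kmp_barrier(bs_plain_barrier, gtid, TRUE, sizeof(kmp_int64),
                    example_reduce_data, example_reduce_fn) == 0) {
    // Primary thread: all workers have checked in and their reduce_data has
    // been combined during the gather phase; publish the result, then release.
    __kmp_end_split_barrier(bs_plain_barrier, gtid);
  }
  // Workers resume here (return value 1) once the primary releases them.
}
#endif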
2175 
2176 void __kmp_join_barrier(int gtid) {
2177   KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2178   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2179 
2180   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2181 
2182   kmp_info_t *this_thr = __kmp_threads[gtid];
2183   kmp_team_t *team;
2184   int tid;
2185 #ifdef KMP_DEBUG
2186   int team_id;
2187 #endif /* KMP_DEBUG */
2188 #if USE_ITT_BUILD
2189   void *itt_sync_obj = NULL;
2190 #if USE_ITT_NOTIFY
2191   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2192     // Get object created at fork_barrier
2193     itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2194 #endif
2195 #endif /* USE_ITT_BUILD */
2196 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2197   int nproc = this_thr->th.th_team_nproc;
2198 #endif
2199   KMP_MB();
2200 
2201   // Get current info
2202   team = this_thr->th.th_team;
2203   KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2204   tid = __kmp_tid_from_gtid(gtid);
2205 #ifdef KMP_DEBUG
2206   team_id = team->t.t_id;
2207   kmp_info_t *master_thread = this_thr->th.th_team_master;
2208   if (master_thread != team->t.t_threads[0]) {
2209     __kmp_print_structure();
2210   }
2211 #endif /* KMP_DEBUG */
2212   KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2213   KMP_MB();
2214 
2215   // Verify state
2216   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2217   KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2218   KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2219   KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2220                 gtid, team_id, tid));
2221 
2222 #if OMPT_SUPPORT
2223   if (ompt_enabled.enabled) {
2224 #if OMPT_OPTIONAL
2225     ompt_data_t *my_task_data;
2226     ompt_data_t *my_parallel_data;
2227     void *codeptr = NULL;
2228     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2229     if (KMP_MASTER_TID(ds_tid) &&
2230         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2231          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2232       codeptr = team->t.ompt_team_info.master_return_address;
2233     my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2234     my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2235     ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2236     ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2237     if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2238       sync_kind = ompt_sync_region_barrier_teams;
2239       ompt_state = ompt_state_wait_barrier_teams;
2240     }
2241     if (ompt_enabled.ompt_callback_sync_region) {
2242       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2243           sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2244     }
2245     if (ompt_enabled.ompt_callback_sync_region_wait) {
2246       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2247           sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2248     }
2249     if (!KMP_MASTER_TID(ds_tid))
2250       this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2251 #endif
2252     this_thr->th.ompt_thread_info.state = ompt_state;
2253   }
2254 #endif
2255 
2256   if (__kmp_tasking_mode == tskm_extra_barrier) {
2257     __kmp_tasking_barrier(team, this_thr, gtid);
2258     KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2259                   gtid, team_id, tid));
2260   }
2261 #ifdef KMP_DEBUG
2262   if (__kmp_tasking_mode != tskm_immediate_exec) {
2263     KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2264                   "%p, th_task_team = %p\n",
2265                   __kmp_gtid_from_thread(this_thr), team_id,
2266                   team->t.t_task_team[this_thr->th.th_task_state],
2267                   this_thr->th.th_task_team));
2268     KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2269   }
2270 #endif /* KMP_DEBUG */
2271 
2272   /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2273      access it when the team struct is not guaranteed to exist. Doing these
2274      loads causes a cache miss and slows down EPCC parallel by 2x. As a
2275      workaround, we do not perform the copy if blocktime=infinite, since the
2276      values are not used by __kmp_wait_template() in that case. */
2277   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2278 #if KMP_USE_MONITOR
2279     this_thr->th.th_team_bt_intervals =
2280         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2281     this_thr->th.th_team_bt_set =
2282         team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2283 #else
2284     this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2285 #endif
2286   }
2287 
2288 #if USE_ITT_BUILD
2289   if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2290     __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2291 #endif /* USE_ITT_BUILD */
2292 
2293   switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2294   case bp_dist_bar: {
2295     __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2296                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2297     break;
2298   }
2299   case bp_hyper_bar: {
2300     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2301     __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2302                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2303     break;
2304   }
2305   case bp_hierarchical_bar: {
2306     __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2307                                       NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2308     break;
2309   }
2310   case bp_tree_bar: {
2311     KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2312     __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2313                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2314     break;
2315   }
2316   default: {
2317     __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2318                                 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2319   }
2320   }
2321 
2322   /* From this point on, the team data structure may be deallocated at any time
2323      by the primary thread - it is unsafe to reference it in any of the worker
2324      threads. Any per-team data items that need to be referenced before the
2325      end of the barrier should be moved to the kmp_task_team_t structs.  */
2326   if (KMP_MASTER_TID(tid)) {
2327     if (__kmp_tasking_mode != tskm_immediate_exec) {
2328       __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2329     }
2330     if (__kmp_display_affinity) {
2331       KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2332     }
2333 #if KMP_STATS_ENABLED
2334     // Have the primary thread flag the workers to indicate they are now
2335     // waiting for the next parallel region. Also wake them up so they switch
2336     // their timers to idle.
2337     for (int i = 0; i < team->t.t_nproc; ++i) {
2338       kmp_info_t *team_thread = team->t.t_threads[i];
2339       if (team_thread == this_thr)
2340         continue;
2341       team_thread->th.th_stats->setIdleFlag();
2342       if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2343           team_thread->th.th_sleep_loc != NULL)
2344         __kmp_null_resume_wrapper(team_thread);
2345     }
2346 #endif
2347 #if USE_ITT_BUILD
2348     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2349       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2350 #endif /* USE_ITT_BUILD */
2351 
2352 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2353     // Join barrier - report frame end
2354     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2355         __kmp_forkjoin_frames_mode &&
2356         (this_thr->th.th_teams_microtask == NULL || // either not in teams
2357          this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2358         team->t.t_active_level == 1) {
2359       kmp_uint64 cur_time = __itt_get_timestamp();
2360       ident_t *loc = team->t.t_ident;
2361       kmp_info_t **other_threads = team->t.t_threads;
2362       switch (__kmp_forkjoin_frames_mode) {
2363       case 1:
2364         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2365                                loc, nproc);
2366         break;
2367       case 2:
2368         __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2369                                loc, nproc);
2370         break;
2371       case 3:
2372         if (__itt_metadata_add_ptr) {
2373           // Initialize with primary thread's wait time
2374           kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2375           // Set arrive time to zero to be able to check it in
2376           // __kmp_invoke_task(); the same is done inside the loop below
2377           this_thr->th.th_bar_arrive_time = 0;
2378           for (int i = 1; i < nproc; ++i) {
2379             delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2380             other_threads[i]->th.th_bar_arrive_time = 0;
2381           }
2382           __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2383                                        cur_time, delta, 0);
2384         }
2385         __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2386                                loc, nproc);
2387         this_thr->th.th_frame_time = cur_time;
2388         break;
2389       }
2390     }
2391 #endif /* USE_ITT_BUILD */
2392   }
2393 #if USE_ITT_BUILD
2394   else {
2395     if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2396       __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2397   }
2398 #endif /* USE_ITT_BUILD */
2399 
2400 #if KMP_DEBUG
2401   if (KMP_MASTER_TID(tid)) {
2402     KA_TRACE(
2403         15,
2404         ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2405          gtid, team_id, tid, nproc));
2406   }
2407 #endif /* KMP_DEBUG */
2408 
2409   // TODO: now, mark worker threads as done so they may be disbanded
2410   KMP_MB(); // Flush all pending memory write invalidates.
2411   KA_TRACE(10,
2412            ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2413 
2414 }
2415 
2416 // TODO release worker threads' fork barriers as we are ready instead of all at
2417 // once
2418 void __kmp_fork_barrier(int gtid, int tid) {
2419   KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2420   KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2421   kmp_info_t *this_thr = __kmp_threads[gtid];
2422   kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2423 #if USE_ITT_BUILD
2424   void *itt_sync_obj = NULL;
2425 #endif /* USE_ITT_BUILD */
2426 #ifdef KMP_DEBUG
2427   if (team)
2428     KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2429                   (team != NULL) ? team->t.t_id : -1, tid));
2430 #endif
2431   // th_team pointer only valid for primary thread here
2432   if (KMP_MASTER_TID(tid)) {
2433 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2434     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2435       // Create itt barrier object
2436       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2437       __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2438     }
2439 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2440 
2441 #ifdef KMP_DEBUG
2442     KMP_DEBUG_ASSERT(team);
2443     kmp_info_t **other_threads = team->t.t_threads;
2444     int i;
2445 
2446     // Verify state
2447     KMP_MB();
2448 
2449     for (i = 1; i < team->t.t_nproc; ++i) {
2450       KA_TRACE(500,
2451                ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2452                 "== %u.\n",
2453                 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2454                 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2455                 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2456       KMP_DEBUG_ASSERT(
2457           (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2458            ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2459       KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2460     }
2461 #endif
2462 
2463     if (__kmp_tasking_mode != tskm_immediate_exec)
2464       __kmp_task_team_setup(this_thr, team);
2465 
2466     /* The primary thread may have changed its blocktime between join barrier
2467        and fork barrier. Copy the blocktime info to the thread, where
2468        __kmp_wait_template() can access it when the team struct is not
2469        guaranteed to exist. */
2470     // See note about the corresponding code in __kmp_join_barrier() being
2471     // performance-critical
2472     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2473 #if KMP_USE_MONITOR
2474       this_thr->th.th_team_bt_intervals =
2475           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2476       this_thr->th.th_team_bt_set =
2477           team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2478 #else
2479       this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2480 #endif
2481     }
2482   } // primary thread
2483 
2484   switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2485   case bp_dist_bar: {
2486     __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2487                                TRUE USE_ITT_BUILD_ARG(NULL));
2488     break;
2489   }
2490   case bp_hyper_bar: {
2491     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2492     __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2493                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2494     break;
2495   }
2496   case bp_hierarchical_bar: {
2497     __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2498                                        TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2499     break;
2500   }
2501   case bp_tree_bar: {
2502     KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2503     __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2504                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2505     break;
2506   }
2507   default: {
2508     __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2509                                  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2510   }
2511   }
2512 
2513 #if OMPT_SUPPORT
2514   ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2515   if (ompt_enabled.enabled &&
2516       (ompt_state == ompt_state_wait_barrier_teams ||
2517        ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2518     int ds_tid = this_thr->th.th_info.ds.ds_tid;
2519     ompt_data_t *task_data = (team)
2520                                  ? OMPT_CUR_TASK_DATA(this_thr)
2521                                  : &(this_thr->th.ompt_thread_info.task_data);
2522     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2523 #if OMPT_OPTIONAL
2524     void *codeptr = NULL;
2525     if (KMP_MASTER_TID(ds_tid) &&
2526         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2527          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2528       codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2529     ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2530     if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2531       sync_kind = ompt_sync_region_barrier_teams;
2532     if (ompt_enabled.ompt_callback_sync_region_wait) {
2533       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2534           sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2535     }
2536     if (ompt_enabled.ompt_callback_sync_region) {
2537       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2538           sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2539     }
2540 #endif
2541     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2542       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2543           ompt_scope_end, NULL, task_data, 0, ds_tid,
2544           ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2545     }
2546   }
2547 #endif
2548 
2549   // Early exit from the fork barrier for threads being reaped at shutdown
2550   if (TCR_4(__kmp_global.g.g_done)) {
2551     this_thr->th.th_task_team = NULL;
2552 
2553 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2554     if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2555       if (!KMP_MASTER_TID(tid)) {
2556         itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2557         if (itt_sync_obj)
2558           __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2559       }
2560     }
2561 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2562     KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2563     return;
2564   }
2565 
2566   /* We can now assume that a valid team structure has been allocated by the
2567      primary thread and propagated to all worker threads. The current thread,
2568      however, may not be part of the team, so we can't blindly assume that the
2569      team pointer is non-null.  */
2570   team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2571   KMP_DEBUG_ASSERT(team != NULL);
2572   tid = __kmp_tid_from_gtid(gtid);
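  // The tid is refreshed as well: this thread's slot in the (possibly new)
  // team may differ from the one it had at the join barrier.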
2573 
2574 #if KMP_BARRIER_ICV_PULL
2575   /* The primary thread's copy of the ICVs was set up on the implicit
2576      taskdata in __kmp_reinitialize_team. __kmp_fork_call() assumes the
2577      primary thread's implicit task has this data before this function is
2578      called. We cannot modify __kmp_fork_call() to look at the fixed ICVs in
2579      the primary thread's thread struct, because the threads arrays may not
2580      have been allocated yet when __kmp_fork_call() is executed. */
2581   {
2582     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2583     if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2584       // Copy the initial ICVs from the primary thread's thread struct to the
2585       // implicit task for this tid.
2586       KA_TRACE(10,
2587                ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2588       __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2589                                tid, FALSE);
2590       copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2591                 &team->t.t_threads[0]
2592                      ->th.th_bar[bs_forkjoin_barrier]
2593                      .bb.th_fixed_icvs);
2594     }
2595   }
2596 #endif // KMP_BARRIER_ICV_PULL
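  // (When the runtime is built with KMP_BARRIER_ICV_PUSH instead, the workers'
  // ICVs were already filled in by the release algorithm above, so nothing
  // more is needed here.)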
2597 
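  // Pick up the task team pointer the primary thread published for this
  // region (see __kmp_task_team_setup above).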
2598   if (__kmp_tasking_mode != tskm_immediate_exec) {
2599     __kmp_task_team_sync(this_thr, team);
2600   }
2601 
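  // Re-establish this thread's affinity for the new region if necessary:
  // balanced affinity is recomputed when the team size changed, while explicit
  // place binding only moves the thread if its assigned place changed.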
2602 #if KMP_AFFINITY_SUPPORTED
2603   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2604   if (proc_bind == proc_bind_intel) {
2605     // Call dynamic affinity settings
2606     if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2607       __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2608     }
2609   } else if (proc_bind != proc_bind_false) {
2610     if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2611       KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2612                      __kmp_gtid_from_thread(this_thr),
2613                      this_thr->th.th_current_place));
2614     } else {
2615       __kmp_affinity_bind_place(gtid);
2616     }
2617   }
2618 #endif // KMP_AFFINITY_SUPPORTED
2619   // Perform the display affinity functionality
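  // (Typically enabled with OMP_DISPLAY_AFFINITY=TRUE; the output format comes
  // from the affinity-format-var ICV, settable via OMP_AFFINITY_FORMAT.)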
2620   if (__kmp_display_affinity) {
2621     if (team->t.t_display_affinity
2622 #if KMP_AFFINITY_SUPPORTED
2623         || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2624 #endif
2625     ) {
2626       // NULL means use the affinity-format-var ICV
2627       __kmp_aux_display_affinity(gtid, NULL);
2628       this_thr->th.th_prev_num_threads = team->t.t_nproc;
2629       this_thr->th.th_prev_level = team->t.t_level;
2630     }
2631   }
2632   if (!KMP_MASTER_TID(tid))
2633     KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2634 
2635 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2636   if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2637     if (!KMP_MASTER_TID(tid)) {
2638       // Get correct barrier object
2639       itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2640       __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2641     } // (prepare called inside barrier_release)
2642   }
2643 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2644   KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2645                 team->t.t_id, tid));
2646 }
2647 
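// Propagate the new ICVs to the team, using whichever scheme was selected at
// compile time (KMP_BARRIER_ICV_PULL, KMP_BARRIER_ICV_PUSH, or the linear
// per-thread copy below); __kmp_fork_barrier() contains the matching
// worker-side pull logic.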
2648 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2649                           kmp_internal_control_t *new_icvs, ident_t *loc) {
2650   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2651 
2652   KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2653   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2654 
2655 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2656    __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2657    implicit task has this data before this function is called. */
2658 #if KMP_BARRIER_ICV_PULL
2659   /* Copy the ICVs into th_fixed_icvs in the primary thread's thread struct
2660      (which remains untouched), where all of the worker threads can access
2661      them and make their own copies after the barrier. */
2662   // The threads arrays should be allocated at this point.
2663   KMP_DEBUG_ASSERT(team->t.t_threads[0]);
2664   copy_icvs(
2665       &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2666       new_icvs);
2667   KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2668                 team->t.t_threads[0], team));
2669 #elif KMP_BARRIER_ICV_PUSH
2670   // The ICVs will be propagated in the fork barrier, so nothing needs to be
2671   // done here.
2672   KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2673                 team->t.t_threads[0], team));
2674 #else
2675   // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
2676   // time.
2677   ngo_load(new_icvs);
2678   // The threads arrays should be allocated at this point.
2679   KMP_DEBUG_ASSERT(team->t.t_threads[0]);
2680   for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2681     // TODO: GEH - pass in better source location info since usually NULL here
2682     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2683                   f, team->t.t_threads[f], team));
2684     __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2685     ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2686     KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2687                   f, team->t.t_threads[f], team));
2688   }
2689   ngo_sync();
2690 #endif // KMP_BARRIER_ICV_PULL
2691 }
2692