xref: /freebsd-src/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMPTARGET_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // windows does not need include files as it doesn't use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* fast (and somewhat portable) way to get unique identifier of executing
111    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
125   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
126      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
127      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
128      __kmp_init_gtid for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
169 
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated */
175         /* stack size is if we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
182   /* get specific to try and determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
190   /* if we havn't been assigned a gtid, then return code */
191   if (i < 0)
192     return i;
193 
194   /* dynamically updated stack window for uber threads to avoid get_specific
195      call */
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
346   va_start(ap, format);
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               (char *)p1 += page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
436     /* On Windows* OS by default abort() causes pop-up error box, which stalls
437        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
438        boxes. _set_abort_behavior() works well, but this function is not
439        available in VS7 (this is not problem for DLL, but it is a problem for
440        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
441        help, at least in some versions of MS C RTL.
442 
443        It seems following sequence is the only way to simulate abort() and
444        avoid pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
552   // TODO: Change to __kmp_break_bootstrap_lock().
553   __kmp_init_bootstrap_lock(lck); // make the lock released
554 }
555 
556 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
557   int i;
558   int thread_count;
559 
560   // PROCESS_DETACH is expected to be called by a thread that executes
561   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
562   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
563   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
564   // threads can be still alive here, although being about to be terminated. The
565   // threads in the array with ds_thread==0 are most suspicious. Actually, it
566   // can be not safe to access the __kmp_threads[].
567 
568   // TODO: does it make sense to check __kmp_roots[] ?
569 
570   // Let's check that there are no other alive threads registered with the OMP
571   // lib.
572   while (1) {
573     thread_count = 0;
574     for (i = 0; i < __kmp_threads_capacity; ++i) {
575       if (!__kmp_threads)
576         continue;
577       kmp_info_t *th = __kmp_threads[i];
578       if (th == NULL)
579         continue;
580       int gtid = th->th.th_info.ds.ds_gtid;
581       if (gtid == gtid_req)
582         continue;
583       if (gtid < 0)
584         continue;
585       DWORD exit_val;
586       int alive = __kmp_is_thread_alive(th, &exit_val);
587       if (alive) {
588         ++thread_count;
589       }
590     }
591     if (thread_count == 0)
592       break; // success
593   }
594 
595   // Assume that I'm alone. Now it might be safe to check and reset locks.
596   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
597   __kmp_reset_lock(&__kmp_forkjoin_lock);
598 #ifdef KMP_DEBUG
599   __kmp_reset_lock(&__kmp_stdio_lock);
600 #endif // KMP_DEBUG
601 }
602 
603 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
604   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
605 
606   switch (fdwReason) {
607 
608   case DLL_PROCESS_ATTACH:
609     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
610 
611     return TRUE;
612 
613   case DLL_PROCESS_DETACH:
614     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
615 
616     if (lpReserved != NULL) {
617       // lpReserved is used for telling the difference:
618       //   lpReserved == NULL when FreeLibrary() was called,
619       //   lpReserved != NULL when the process terminates.
620       // When FreeLibrary() is called, worker threads remain alive. So they will
621       // release the forkjoin lock by themselves. When the process terminates,
622       // worker threads disappear triggering the problem of unreleased forkjoin
623       // lock as described below.
624 
625       // A worker thread can take the forkjoin lock. The problem comes up if
626       // that worker thread becomes dead before it releases the forkjoin lock.
627       // The forkjoin lock remains taken, while the thread executing
628       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
629       // to take the forkjoin lock and will always fail, so that the application
630       // will never finish [normally]. This scenario is possible if
631       // __kmpc_end() has not been executed. It looks like it's not a corner
632       // case, but common cases:
633       // - the main function was compiled by an alternative compiler;
634       // - the main function was compiled by icl but without /Qopenmp
635       //   (application with plugins);
636       // - application terminates by calling C exit(), Fortran CALL EXIT() or
637       //   Fortran STOP.
638       // - alive foreign thread prevented __kmpc_end from doing cleanup.
639       //
640       // This is a hack to work around the problem.
641       // TODO: !!! figure out something better.
642       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
643     }
644 
645     __kmp_internal_end_library(__kmp_gtid_get_specific());
646 
647     return TRUE;
648 
649   case DLL_THREAD_ATTACH:
650     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
651 
652     /* if we want to register new siblings all the time here call
653      * __kmp_get_gtid(); */
654     return TRUE;
655 
656   case DLL_THREAD_DETACH:
657     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
658 
659     __kmp_internal_end_thread(__kmp_gtid_get_specific());
660     return TRUE;
661   }
662 
663   return TRUE;
664 }
665 
666 #endif /* KMP_OS_WINDOWS */
667 #endif /* KMP_DYNAMIC_LIB */
668 
669 /* __kmp_parallel_deo -- Wait until it's our turn. */
670 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
671   int gtid = *gtid_ref;
672 #ifdef BUILD_PARALLEL_ORDERED
673   kmp_team_t *team = __kmp_team_from_gtid(gtid);
674 #endif /* BUILD_PARALLEL_ORDERED */
675 
676   if (__kmp_env_consistency_check) {
677     if (__kmp_threads[gtid]->th.th_root->r.r_active)
678 #if KMP_USE_DYNAMIC_LOCK
679       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
680 #else
681       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
682 #endif
683   }
684 #ifdef BUILD_PARALLEL_ORDERED
685   if (!team->t.t_serialized) {
686     KMP_MB();
687     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
688              NULL);
689     KMP_MB();
690   }
691 #endif /* BUILD_PARALLEL_ORDERED */
692 }
693 
694 /* __kmp_parallel_dxo -- Signal the next task. */
695 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696   int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698   int tid = __kmp_tid_from_gtid(gtid);
699   kmp_team_t *team = __kmp_team_from_gtid(gtid);
700 #endif /* BUILD_PARALLEL_ORDERED */
701 
702   if (__kmp_env_consistency_check) {
703     if (__kmp_threads[gtid]->th.th_root->r.r_active)
704       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
705   }
706 #ifdef BUILD_PARALLEL_ORDERED
707   if (!team->t.t_serialized) {
708     KMP_MB(); /* Flush all pending memory write invalidates.  */
709 
710     /* use the tid of the next thread in this team */
711     /* TODO replace with general release procedure */
712     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
713 
714     KMP_MB(); /* Flush all pending memory write invalidates.  */
715   }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* ------------------------------------------------------------------------ */
720 /* The BARRIER for a SINGLE process section is always explicit   */
721 
722 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
723   int status;
724   kmp_info_t *th;
725   kmp_team_t *team;
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729   __kmp_resume_if_soft_paused();
730 
731   th = __kmp_threads[gtid];
732   team = th->th.th_team;
733   status = 0;
734 
735   th->th.th_ident = id_ref;
736 
737   if (team->t.t_serialized) {
738     status = 1;
739   } else {
740     kmp_int32 old_this = th->th.th_local.this_construct;
741 
742     ++th->th.th_local.this_construct;
743     /* try to set team count to thread count--success means thread got the
744        single block */
745     /* TODO: Should this be acquire or release? */
746     if (team->t.t_construct == old_this) {
747       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
748                                               th->th.th_local.this_construct);
749     }
750 #if USE_ITT_BUILD
751     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
752         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
753         team->t.t_active_level ==
754             1) { // Only report metadata by master of active team at level 1
755       __kmp_itt_metadata_single(id_ref);
756     }
757 #endif /* USE_ITT_BUILD */
758   }
759 
760   if (__kmp_env_consistency_check) {
761     if (status && push_ws) {
762       __kmp_push_workshare(gtid, ct_psingle, id_ref);
763     } else {
764       __kmp_check_workshare(gtid, ct_psingle, id_ref);
765     }
766   }
767 #if USE_ITT_BUILD
768   if (status) {
769     __kmp_itt_single_start(gtid);
770   }
771 #endif /* USE_ITT_BUILD */
772   return status;
773 }
774 
775 void __kmp_exit_single(int gtid) {
776 #if USE_ITT_BUILD
777   __kmp_itt_single_end(gtid);
778 #endif /* USE_ITT_BUILD */
779   if (__kmp_env_consistency_check)
780     __kmp_pop_workshare(gtid, ct_psingle, NULL);
781 }
782 
783 /* determine if we can go parallel or must use a serialized parallel region and
784  * how many threads we can use
785  * set_nproc is the number of threads requested for the team
786  * returns 0 if we should serialize or only use one thread,
787  * otherwise the number of threads to use
788  * The forkjoin lock is held by the caller. */
789 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
790                                  int master_tid, int set_nthreads,
791                                  int enter_teams) {
792   int capacity;
793   int new_nthreads;
794   KMP_DEBUG_ASSERT(__kmp_init_serial);
795   KMP_DEBUG_ASSERT(root && parent_team);
796   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
797 
798   // If dyn-var is set, dynamically adjust the number of desired threads,
799   // according to the method specified by dynamic_mode.
800   new_nthreads = set_nthreads;
801   if (!get__dynamic_2(parent_team, master_tid)) {
802     ;
803   }
804 #ifdef USE_LOAD_BALANCE
805   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
806     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
807     if (new_nthreads == 1) {
808       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
809                     "reservation to 1 thread\n",
810                     master_tid));
811       return 1;
812     }
813     if (new_nthreads < set_nthreads) {
814       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
815                     "reservation to %d threads\n",
816                     master_tid, new_nthreads));
817     }
818   }
819 #endif /* USE_LOAD_BALANCE */
820   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
821     new_nthreads = __kmp_avail_proc - __kmp_nth +
822                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
823     if (new_nthreads <= 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     } else {
834       new_nthreads = set_nthreads;
835     }
836   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
837     if (set_nthreads > 2) {
838       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
839       new_nthreads = (new_nthreads % set_nthreads) + 1;
840       if (new_nthreads == 1) {
841         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
842                       "reservation to 1 thread\n",
843                       master_tid));
844         return 1;
845       }
846       if (new_nthreads < set_nthreads) {
847         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
848                       "reservation to %d threads\n",
849                       master_tid, new_nthreads));
850       }
851     }
852   } else {
853     KMP_ASSERT(0);
854   }
855 
856   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
857   if (__kmp_nth + new_nthreads -
858           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859       __kmp_max_nth) {
860     int tl_nthreads = __kmp_max_nth - __kmp_nth +
861                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
862     if (tl_nthreads <= 0) {
863       tl_nthreads = 1;
864     }
865 
866     // If dyn-var is false, emit a 1-time warning.
867     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
868       __kmp_reserve_warn = 1;
869       __kmp_msg(kmp_ms_warning,
870                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
871                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
872     }
873     if (tl_nthreads == 1) {
874       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
875                     "reduced reservation to 1 thread\n",
876                     master_tid));
877       return 1;
878     }
879     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
880                   "reservation to %d threads\n",
881                   master_tid, tl_nthreads));
882     new_nthreads = tl_nthreads;
883   }
884 
885   // Respect OMP_THREAD_LIMIT
886   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
887   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
888   if (cg_nthreads + new_nthreads -
889           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
890       max_cg_threads) {
891     int tl_nthreads = max_cg_threads - cg_nthreads +
892                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
893     if (tl_nthreads <= 0) {
894       tl_nthreads = 1;
895     }
896 
897     // If dyn-var is false, emit a 1-time warning.
898     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
899       __kmp_reserve_warn = 1;
900       __kmp_msg(kmp_ms_warning,
901                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
902                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
903     }
904     if (tl_nthreads == 1) {
905       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
906                     "reduced reservation to 1 thread\n",
907                     master_tid));
908       return 1;
909     }
910     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
911                   "reservation to %d threads\n",
912                   master_tid, tl_nthreads));
913     new_nthreads = tl_nthreads;
914   }
915 
916   // Check if the threads array is large enough, or needs expanding.
917   // See comment in __kmp_register_root() about the adjustment if
918   // __kmp_threads[0] == NULL.
919   capacity = __kmp_threads_capacity;
920   if (TCR_PTR(__kmp_threads[0]) == NULL) {
921     --capacity;
922   }
923   if (__kmp_nth + new_nthreads -
924           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
925       capacity) {
926     // Expand the threads array.
927     int slotsRequired = __kmp_nth + new_nthreads -
928                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
929                         capacity;
930     int slotsAdded = __kmp_expand_threads(slotsRequired);
931     if (slotsAdded < slotsRequired) {
932       // The threads array was not expanded enough.
933       new_nthreads -= (slotsRequired - slotsAdded);
934       KMP_ASSERT(new_nthreads >= 1);
935 
936       // If dyn-var is false, emit a 1-time warning.
937       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
938         __kmp_reserve_warn = 1;
939         if (__kmp_tp_cached) {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
943                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
944         } else {
945           __kmp_msg(kmp_ms_warning,
946                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
947                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
948         }
949       }
950     }
951   }
952 
953 #ifdef KMP_DEBUG
954   if (new_nthreads == 1) {
955     KC_TRACE(10,
956              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
957               "dead roots and rechecking; requested %d threads\n",
958               __kmp_get_gtid(), set_nthreads));
959   } else {
960     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
961                   " %d threads\n",
962                   __kmp_get_gtid(), new_nthreads, set_nthreads));
963   }
964 #endif // KMP_DEBUG
965   return new_nthreads;
966 }
967 
968 /* Allocate threads from the thread pool and assign them to the new team. We are
969    assured that there are enough threads available, because we checked on that
970    earlier within critical section forkjoin */
971 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
972                                     kmp_info_t *master_th, int master_gtid) {
973   int i;
974   int use_hot_team;
975 
976   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
977   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
978   KMP_MB();
979 
980   /* first, let's setup the master thread */
981   master_th->th.th_info.ds.ds_tid = 0;
982   master_th->th.th_team = team;
983   master_th->th.th_team_nproc = team->t.t_nproc;
984   master_th->th.th_team_master = master_th;
985   master_th->th.th_team_serialized = FALSE;
986   master_th->th.th_dispatch = &team->t.t_dispatch[0];
987 
988 /* make sure we are not the optimized hot team */
989 #if KMP_NESTED_HOT_TEAMS
990   use_hot_team = 0;
991   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
992   if (hot_teams) { // hot teams array is not allocated if
993     // KMP_HOT_TEAMS_MAX_LEVEL=0
994     int level = team->t.t_active_level - 1; // index in array of hot teams
995     if (master_th->th.th_teams_microtask) { // are we inside the teams?
996       if (master_th->th.th_teams_size.nteams > 1) {
997         ++level; // level was not increased in teams construct for
998         // team_of_masters
999       }
1000       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1001           master_th->th.th_teams_level == team->t.t_level) {
1002         ++level; // level was not increased in teams construct for
1003         // team_of_workers before the parallel
1004       } // team->t.t_level will be increased inside parallel
1005     }
1006     if (level < __kmp_hot_teams_max_level) {
1007       if (hot_teams[level].hot_team) {
1008         // hot team has already been allocated for given level
1009         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1010         use_hot_team = 1; // the team is ready to use
1011       } else {
1012         use_hot_team = 0; // AC: threads are not allocated yet
1013         hot_teams[level].hot_team = team; // remember new hot team
1014         hot_teams[level].hot_team_nth = team->t.t_nproc;
1015       }
1016     } else {
1017       use_hot_team = 0;
1018     }
1019   }
1020 #else
1021   use_hot_team = team == root->r.r_hot_team;
1022 #endif
1023   if (!use_hot_team) {
1024 
1025     /* install the master thread */
1026     team->t.t_threads[0] = master_th;
1027     __kmp_initialize_info(master_th, team, 0, master_gtid);
1028 
1029     /* now, install the worker threads */
1030     for (i = 1; i < team->t.t_nproc; i++) {
1031 
1032       /* fork or reallocate a new thread and install it in team */
1033       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1034       team->t.t_threads[i] = thr;
1035       KMP_DEBUG_ASSERT(thr);
1036       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1037       /* align team and thread arrived states */
1038       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1039                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1040                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1041                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1042                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1043                     team->t.t_bar[bs_plain_barrier].b_arrived));
1044       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1045       thr->th.th_teams_level = master_th->th.th_teams_level;
1046       thr->th.th_teams_size = master_th->th.th_teams_size;
1047       { // Initialize threads' barrier data.
1048         int b;
1049         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1050         for (b = 0; b < bs_last_barrier; ++b) {
1051           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1052           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1053 #if USE_DEBUGGER
1054           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1055 #endif
1056         }
1057       }
1058     }
1059 
1060 #if KMP_AFFINITY_SUPPORTED
1061     __kmp_partition_places(team);
1062 #endif
1063   }
1064 
1065   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1066     for (i = 0; i < team->t.t_nproc; i++) {
1067       kmp_info_t *thr = team->t.t_threads[i];
1068       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1069           thr->th.th_prev_level != team->t.t_level) {
1070         team->t.t_display_affinity = 1;
1071         break;
1072       }
1073     }
1074   }
1075 
1076   KMP_MB();
1077 }
1078 
1079 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1080 // Propagate any changes to the floating point control registers out to the team
1081 // We try to avoid unnecessary writes to the relevant cache line in the team
1082 // structure, so we don't make changes unless they are needed.
1083 inline static void propagateFPControl(kmp_team_t *team) {
1084   if (__kmp_inherit_fp_control) {
1085     kmp_int16 x87_fpu_control_word;
1086     kmp_uint32 mxcsr;
1087 
1088     // Get master values of FPU control flags (both X87 and vector)
1089     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090     __kmp_store_mxcsr(&mxcsr);
1091     mxcsr &= KMP_X86_MXCSR_MASK;
1092 
1093     // There is no point looking at t_fp_control_saved here.
1094     // If it is TRUE, we still have to update the values if they are different
1095     // from those we now have. If it is FALSE we didn't save anything yet, but
1096     // our objective is the same. We have to ensure that the values in the team
1097     // are the same as those we have.
1098     // So, this code achieves what we need whether or not t_fp_control_saved is
1099     // true. By checking whether the value needs updating we avoid unnecessary
1100     // writes that would put the cache-line into a written state, causing all
1101     // threads in the team to have to read it again.
1102     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1103     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1104     // Although we don't use this value, other code in the runtime wants to know
1105     // whether it should restore them. So we must ensure it is correct.
1106     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1107   } else {
1108     // Similarly here. Don't write to this cache-line in the team structure
1109     // unless we have to.
1110     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1111   }
1112 }
1113 
1114 // Do the opposite, setting the hardware registers to the updated values from
1115 // the team.
1116 inline static void updateHWFPControl(kmp_team_t *team) {
1117   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1118     // Only reset the fp control regs if they have been changed in the team.
1119     // the parallel region that we are exiting.
1120     kmp_int16 x87_fpu_control_word;
1121     kmp_uint32 mxcsr;
1122     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1123     __kmp_store_mxcsr(&mxcsr);
1124     mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1127       __kmp_clear_x87_fpu_status_word();
1128       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1129     }
1130 
1131     if (team->t.t_mxcsr != mxcsr) {
1132       __kmp_load_mxcsr(&team->t.t_mxcsr);
1133     }
1134   }
1135 }
1136 #else
1137 #define propagateFPControl(x) ((void)0)
1138 #define updateHWFPControl(x) ((void)0)
1139 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1140 
1141 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1142                                      int realloc); // forward declaration
1143 
1144 /* Run a parallel region that has been serialized, so runs only in a team of the
1145    single master thread. */
1146 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1147   kmp_info_t *this_thr;
1148   kmp_team_t *serial_team;
1149 
1150   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1151 
1152   /* Skip all this code for autopar serialized loops since it results in
1153      unacceptable overhead */
1154   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1155     return;
1156 
1157   if (!TCR_4(__kmp_init_parallel))
1158     __kmp_parallel_initialize();
1159   __kmp_resume_if_soft_paused();
1160 
1161   this_thr = __kmp_threads[global_tid];
1162   serial_team = this_thr->th.th_serial_team;
1163 
1164   /* utilize the serialized team held by this thread */
1165   KMP_DEBUG_ASSERT(serial_team);
1166   KMP_MB();
1167 
1168   if (__kmp_tasking_mode != tskm_immediate_exec) {
1169     KMP_DEBUG_ASSERT(
1170         this_thr->th.th_task_team ==
1171         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1172     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1173                      NULL);
1174     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1175                   "team %p, new task_team = NULL\n",
1176                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1177     this_thr->th.th_task_team = NULL;
1178   }
1179 
1180   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1181   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1182     proc_bind = proc_bind_false;
1183   } else if (proc_bind == proc_bind_default) {
1184     // No proc_bind clause was specified, so use the current value
1185     // of proc-bind-var for this parallel region.
1186     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1187   }
1188   // Reset for next parallel region
1189   this_thr->th.th_set_proc_bind = proc_bind_default;
1190 
1191 #if OMPT_SUPPORT
1192   ompt_data_t ompt_parallel_data = ompt_data_none;
1193   ompt_data_t *implicit_task_data;
1194   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1195   if (ompt_enabled.enabled &&
1196       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1197 
1198     ompt_task_info_t *parent_task_info;
1199     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1200 
1201     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1202     if (ompt_enabled.ompt_callback_parallel_begin) {
1203       int team_size = 1;
1204 
1205       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1206           &(parent_task_info->task_data), &(parent_task_info->frame),
1207           &ompt_parallel_data, team_size,
1208           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1209     }
1210   }
1211 #endif // OMPT_SUPPORT
1212 
1213   if (this_thr->th.th_team != serial_team) {
1214     // Nested level will be an index in the nested nthreads array
1215     int level = this_thr->th.th_team->t.t_level;
1216 
1217     if (serial_team->t.t_serialized) {
1218       /* this serial team was already used
1219          TODO increase performance by making this locks more specific */
1220       kmp_team_t *new_team;
1221 
1222       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1223 
1224       new_team =
1225           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1226 #if OMPT_SUPPORT
1227                               ompt_parallel_data,
1228 #endif
1229                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1230                               0 USE_NESTED_HOT_ARG(NULL));
1231       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1232       KMP_ASSERT(new_team);
1233 
1234       /* setup new serialized team and install it */
1235       new_team->t.t_threads[0] = this_thr;
1236       new_team->t.t_parent = this_thr->th.th_team;
1237       serial_team = new_team;
1238       this_thr->th.th_serial_team = serial_team;
1239 
1240       KF_TRACE(
1241           10,
1242           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1243            global_tid, serial_team));
1244 
1245       /* TODO the above breaks the requirement that if we run out of resources,
1246          then we can still guarantee that serialized teams are ok, since we may
1247          need to allocate a new one */
1248     } else {
1249       KF_TRACE(
1250           10,
1251           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1252            global_tid, serial_team));
1253     }
1254 
1255     /* we have to initialize this serial team */
1256     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1257     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1258     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1259     serial_team->t.t_ident = loc;
1260     serial_team->t.t_serialized = 1;
1261     serial_team->t.t_nproc = 1;
1262     serial_team->t.t_parent = this_thr->th.th_team;
1263     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1264     this_thr->th.th_team = serial_team;
1265     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1266 
1267     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1268                   this_thr->th.th_current_task));
1269     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1270     this_thr->th.th_current_task->td_flags.executing = 0;
1271 
1272     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1273 
1274     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1275        implicit task for each serialized task represented by
1276        team->t.t_serialized? */
1277     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1278               &this_thr->th.th_current_task->td_parent->td_icvs);
1279 
1280     // Thread value exists in the nested nthreads array for the next nested
1281     // level
1282     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1283       this_thr->th.th_current_task->td_icvs.nproc =
1284           __kmp_nested_nth.nth[level + 1];
1285     }
1286 
1287     if (__kmp_nested_proc_bind.used &&
1288         (level + 1 < __kmp_nested_proc_bind.used)) {
1289       this_thr->th.th_current_task->td_icvs.proc_bind =
1290           __kmp_nested_proc_bind.bind_types[level + 1];
1291     }
1292 
1293 #if USE_DEBUGGER
1294     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1295 #endif
1296     this_thr->th.th_info.ds.ds_tid = 0;
1297 
1298     /* set thread cache values */
1299     this_thr->th.th_team_nproc = 1;
1300     this_thr->th.th_team_master = this_thr;
1301     this_thr->th.th_team_serialized = 1;
1302 
1303     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1304     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1305     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1306 
1307     propagateFPControl(serial_team);
1308 
1309     /* check if we need to allocate dispatch buffers stack */
1310     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1311     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1312       serial_team->t.t_dispatch->th_disp_buffer =
1313           (dispatch_private_info_t *)__kmp_allocate(
1314               sizeof(dispatch_private_info_t));
1315     }
1316     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1317 
1318     KMP_MB();
1319 
1320   } else {
1321     /* this serialized team is already being used,
1322      * that's fine, just add another nested level */
1323     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1324     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1325     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1326     ++serial_team->t.t_serialized;
1327     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1328 
1329     // Nested level will be an index in the nested nthreads array
1330     int level = this_thr->th.th_team->t.t_level;
1331     // Thread value exists in the nested nthreads array for the next nested
1332     // level
1333     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1334       this_thr->th.th_current_task->td_icvs.nproc =
1335           __kmp_nested_nth.nth[level + 1];
1336     }
1337     serial_team->t.t_level++;
1338     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1339                   "of serial team %p to %d\n",
1340                   global_tid, serial_team, serial_team->t.t_level));
1341 
1342     /* allocate/push dispatch buffers stack */
1343     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1344     {
1345       dispatch_private_info_t *disp_buffer =
1346           (dispatch_private_info_t *)__kmp_allocate(
1347               sizeof(dispatch_private_info_t));
1348       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1349       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1350     }
1351     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352 
1353     KMP_MB();
1354   }
1355   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1356 
1357   // Perform the display affinity functionality for
1358   // serialized parallel regions
1359   if (__kmp_display_affinity) {
1360     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1361         this_thr->th.th_prev_num_threads != 1) {
1362       // NULL means use the affinity-format-var ICV
1363       __kmp_aux_display_affinity(global_tid, NULL);
1364       this_thr->th.th_prev_level = serial_team->t.t_level;
1365       this_thr->th.th_prev_num_threads = 1;
1366     }
1367   }
1368 
1369   if (__kmp_env_consistency_check)
1370     __kmp_push_parallel(global_tid, NULL);
1371 #if OMPT_SUPPORT
1372   serial_team->t.ompt_team_info.master_return_address = codeptr;
1373   if (ompt_enabled.enabled &&
1374       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1375     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1376 
1377     ompt_lw_taskteam_t lw_taskteam;
1378     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1379                             &ompt_parallel_data, codeptr);
1380 
1381     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1382     // don't use lw_taskteam after linking. content was swaped
1383 
1384     /* OMPT implicit task begin */
1385     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1386     if (ompt_enabled.ompt_callback_implicit_task) {
1387       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1388           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1389           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1390       OMPT_CUR_TASK_INFO(this_thr)
1391           ->thread_num = __kmp_tid_from_gtid(global_tid);
1392     }
1393 
1394     /* OMPT state */
1395     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1396     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1397   }
1398 #endif
1399 }
1400 
1401 /* most of the work for a fork */
1402 /* return true if we really went parallel, false if serialized */
1403 int __kmp_fork_call(ident_t *loc, int gtid,
1404                     enum fork_context_e call_context, // Intel, GNU, ...
1405                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1406                     kmp_va_list ap) {
1407   void **argv;
1408   int i;
1409   int master_tid;
1410   int master_this_cons;
1411   kmp_team_t *team;
1412   kmp_team_t *parent_team;
1413   kmp_info_t *master_th;
1414   kmp_root_t *root;
1415   int nthreads;
1416   int master_active;
1417   int master_set_numthreads;
1418   int level;
1419   int active_level;
1420   int teams_level;
1421 #if KMP_NESTED_HOT_TEAMS
1422   kmp_hot_team_ptr_t **p_hot_teams;
1423 #endif
1424   { // KMP_TIME_BLOCK
1425     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1426     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1427 
1428     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1429     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1430       /* Some systems prefer the stack for the root thread(s) to start with */
1431       /* some gap from the parent stack to prevent false sharing. */
1432       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1433       /* These 2 lines below are so this does not get optimized out */
1434       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1435         __kmp_stkpadding += (short)((kmp_int64)dummy);
1436     }
1437 
1438     /* initialize if needed */
1439     KMP_DEBUG_ASSERT(
1440         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1441     if (!TCR_4(__kmp_init_parallel))
1442       __kmp_parallel_initialize();
1443     __kmp_resume_if_soft_paused();
1444 
1445     /* setup current data */
1446     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1447     // shutdown
1448     parent_team = master_th->th.th_team;
1449     master_tid = master_th->th.th_info.ds.ds_tid;
1450     master_this_cons = master_th->th.th_local.this_construct;
1451     root = master_th->th.th_root;
1452     master_active = root->r.r_active;
1453     master_set_numthreads = master_th->th.th_set_nproc;
1454 
1455 #if OMPT_SUPPORT
1456     ompt_data_t ompt_parallel_data = ompt_data_none;
1457     ompt_data_t *parent_task_data;
1458     ompt_frame_t *ompt_frame;
1459     ompt_data_t *implicit_task_data;
1460     void *return_address = NULL;
1461 
1462     if (ompt_enabled.enabled) {
1463       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1464                                     NULL, NULL);
1465       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1466     }
1467 #endif
1468 
1469     // Nested level will be an index in the nested nthreads array
1470     level = parent_team->t.t_level;
1471     // used to launch non-serial teams even if nested is not allowed
1472     active_level = parent_team->t.t_active_level;
1473     // needed to check nesting inside the teams
1474     teams_level = master_th->th.th_teams_level;
1475 #if KMP_NESTED_HOT_TEAMS
1476     p_hot_teams = &master_th->th.th_hot_teams;
1477     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1478       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1479           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1480       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1481       // it is either actual or not needed (when active_level > 0)
1482       (*p_hot_teams)[0].hot_team_nth = 1;
1483     }
1484 #endif
1485 
1486 #if OMPT_SUPPORT
1487     if (ompt_enabled.enabled) {
1488       if (ompt_enabled.ompt_callback_parallel_begin) {
1489         int team_size = master_set_numthreads
1490                             ? master_set_numthreads
1491                             : get__nproc_2(parent_team, master_tid);
1492         int flags = OMPT_INVOKER(call_context) |
1493                     ((microtask == (microtask_t)__kmp_teams_master)
1494                          ? ompt_parallel_league
1495                          : ompt_parallel_team);
1496         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1497             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1498             return_address);
1499       }
1500       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1501     }
1502 #endif
1503 
1504     master_th->th.th_ident = loc;
1505 
1506     if (master_th->th.th_teams_microtask && ap &&
1507         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1508       // AC: This is start of parallel that is nested inside teams construct.
1509       // The team is actual (hot), all workers are ready at the fork barrier.
1510       // No lock needed to initialize the team a bit, then free workers.
1511       parent_team->t.t_ident = loc;
1512       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1513       parent_team->t.t_argc = argc;
1514       argv = (void **)parent_team->t.t_argv;
1515       for (i = argc - 1; i >= 0; --i)
1516         *argv++ = va_arg(kmp_va_deref(ap), void *);
1517       // Increment our nested depth levels, but not increase the serialization
1518       if (parent_team == master_th->th.th_serial_team) {
1519         // AC: we are in serialized parallel
1520         __kmpc_serialized_parallel(loc, gtid);
1521         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1522 
1523         if (call_context == fork_context_gnu) {
1524           // AC: need to decrement t_serialized for enquiry functions to work
1525           // correctly, will restore at join time
1526           parent_team->t.t_serialized--;
1527           return TRUE;
1528         }
1529 
1530 #if OMPT_SUPPORT
1531         void *dummy;
1532         void **exit_frame_p;
1533 
1534         ompt_lw_taskteam_t lw_taskteam;
1535 
1536         if (ompt_enabled.enabled) {
1537           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1538                                   &ompt_parallel_data, return_address);
1539           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1540 
1541           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1542           // don't use lw_taskteam after linking. content was swaped
1543 
1544           /* OMPT implicit task begin */
1545           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1546           if (ompt_enabled.ompt_callback_implicit_task) {
1547             OMPT_CUR_TASK_INFO(master_th)
1548                 ->thread_num = __kmp_tid_from_gtid(gtid);
1549             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1550                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1551                 implicit_task_data, 1,
1552                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553           }
1554 
1555           /* OMPT state */
1556           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1557         } else {
1558           exit_frame_p = &dummy;
1559         }
1560 #endif
1561         // AC: need to decrement t_serialized for enquiry functions to work
1562         // correctly, will restore at join time
1563         parent_team->t.t_serialized--;
1564 
1565         {
1566           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1567           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1568           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1569 #if OMPT_SUPPORT
1570                                  ,
1571                                  exit_frame_p
1572 #endif
1573                                  );
1574         }
1575 
1576 #if OMPT_SUPPORT
1577         if (ompt_enabled.enabled) {
1578           *exit_frame_p = NULL;
1579           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1580           if (ompt_enabled.ompt_callback_implicit_task) {
1581             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1582                 ompt_scope_end, NULL, implicit_task_data, 1,
1583                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1584           }
1585           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1586           __ompt_lw_taskteam_unlink(master_th);
1587           if (ompt_enabled.ompt_callback_parallel_end) {
1588             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1589                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1590                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1591                 return_address);
1592           }
1593           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1594         }
1595 #endif
1596         return TRUE;
1597       }
1598 
1599       parent_team->t.t_pkfn = microtask;
1600       parent_team->t.t_invoke = invoker;
1601       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1602       parent_team->t.t_active_level++;
1603       parent_team->t.t_level++;
1604       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1605 
1606 #if OMPT_SUPPORT
1607       if (ompt_enabled.enabled) {
1608         ompt_lw_taskteam_t lw_taskteam;
1609         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1610                                 &ompt_parallel_data, return_address);
1611         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1612       }
1613 #endif
1614 
1615       /* Change number of threads in the team if requested */
1616       if (master_set_numthreads) { // The parallel has num_threads clause
1617         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1618           // AC: only can reduce number of threads dynamically, can't increase
1619           kmp_info_t **other_threads = parent_team->t.t_threads;
1620           parent_team->t.t_nproc = master_set_numthreads;
1621           for (i = 0; i < master_set_numthreads; ++i) {
1622             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1623           }
1624           // Keep extra threads hot in the team for possible next parallels
1625         }
1626         master_th->th.th_set_nproc = 0;
1627       }
1628 
1629 #if USE_DEBUGGER
1630       if (__kmp_debugging) { // Let debugger override number of threads.
1631         int nth = __kmp_omp_num_threads(loc);
1632         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1633           master_set_numthreads = nth;
1634         }
1635       }
1636 #endif
1637 
1638 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1639       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1640            KMP_ITT_DEBUG) &&
1641           __kmp_forkjoin_frames_mode == 3 &&
1642           parent_team->t.t_active_level == 1 // only report frames at level 1
1643           && master_th->th.th_teams_size.nteams == 1) {
1644         kmp_uint64 tmp_time = __itt_get_timestamp();
1645         master_th->th.th_frame_time = tmp_time;
1646         parent_team->t.t_region_time = tmp_time;
1647       }
1648       if (__itt_stack_caller_create_ptr) {
1649         // create new stack stitching id before entering fork barrier
1650         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1651       }
1652 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1653 
1654       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1655                     "master_th=%p, gtid=%d\n",
1656                     root, parent_team, master_th, gtid));
1657       __kmp_internal_fork(loc, gtid, parent_team);
1658       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1659                     "master_th=%p, gtid=%d\n",
1660                     root, parent_team, master_th, gtid));
1661 
1662       if (call_context == fork_context_gnu)
1663         return TRUE;
1664 
1665       /* Invoke microtask for MASTER thread */
1666       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1667                     parent_team->t.t_id, parent_team->t.t_pkfn));
1668 
1669       if (!parent_team->t.t_invoke(gtid)) {
1670         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1671       }
1672       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1673                     parent_team->t.t_id, parent_team->t.t_pkfn));
1674       KMP_MB(); /* Flush all pending memory write invalidates.  */
1675 
1676       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1677 
1678       return TRUE;
1679     } // Parallel closely nested in teams construct
1680 
1681 #if KMP_DEBUG
1682     if (__kmp_tasking_mode != tskm_immediate_exec) {
1683       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1684                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1685     }
1686 #endif
1687 
1688     if (parent_team->t.t_active_level >=
1689         master_th->th.th_current_task->td_icvs.max_active_levels) {
1690       nthreads = 1;
1691     } else {
1692       int enter_teams = ((ap == NULL && active_level == 0) ||
1693                          (ap && teams_level > 0 && teams_level == level));
1694       nthreads =
1695           master_set_numthreads
1696               ? master_set_numthreads
1697               : get__nproc_2(
1698                     parent_team,
1699                     master_tid); // TODO: get nproc directly from current task
1700 
1701       // Check if we need to take forkjoin lock? (no need for serialized
1702       // parallel out of teams construct). This code moved here from
1703       // __kmp_reserve_threads() to speedup nested serialized parallels.
1704       if (nthreads > 1) {
1705         if ((get__max_active_levels(master_th) == 1 &&
1706              (root->r.r_in_parallel && !enter_teams)) ||
1707             (__kmp_library == library_serial)) {
1708           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1709                         " threads\n",
1710                         gtid, nthreads));
1711           nthreads = 1;
1712         }
1713       }
1714       if (nthreads > 1) {
1715         /* determine how many new threads we can use */
1716         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1717         /* AC: If we execute teams from parallel region (on host), then teams
1718            should be created but each can only have 1 thread if nesting is
1719            disabled. If teams called from serial region, then teams and their
1720            threads should be created regardless of the nesting setting. */
1721         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1722                                          nthreads, enter_teams);
1723         if (nthreads == 1) {
1724           // Free lock for single thread execution here; for multi-thread
1725           // execution it will be freed later after team of threads created
1726           // and initialized
1727           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1728         }
1729       }
1730     }
1731     KMP_DEBUG_ASSERT(nthreads > 0);
1732 
1733     // If we temporarily changed the set number of threads then restore it now
1734     master_th->th.th_set_nproc = 0;
1735 
1736     /* create a serialized parallel region? */
1737     if (nthreads == 1) {
1738 /* josh todo: hypothetical question: what do we do for OS X*? */
1739 #if KMP_OS_LINUX &&                                                            \
1740     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1741       void *args[argc];
1742 #else
1743       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1744 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1745           KMP_ARCH_AARCH64) */
1746 
1747       KA_TRACE(20,
1748                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1749 
1750       __kmpc_serialized_parallel(loc, gtid);
1751 
1752       if (call_context == fork_context_intel) {
1753         /* TODO this sucks, use the compiler itself to pass args! :) */
1754         master_th->th.th_serial_team->t.t_ident = loc;
1755         if (!ap) {
1756           // revert change made in __kmpc_serialized_parallel()
1757           master_th->th.th_serial_team->t.t_level--;
1758 // Get args from parent team for teams construct
1759 
1760 #if OMPT_SUPPORT
1761           void *dummy;
1762           void **exit_frame_p;
1763           ompt_task_info_t *task_info;
1764 
1765           ompt_lw_taskteam_t lw_taskteam;
1766 
1767           if (ompt_enabled.enabled) {
1768             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1769                                     &ompt_parallel_data, return_address);
1770 
1771             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1772             // don't use lw_taskteam after linking. content was swaped
1773 
1774             task_info = OMPT_CUR_TASK_INFO(master_th);
1775             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1776             if (ompt_enabled.ompt_callback_implicit_task) {
1777               OMPT_CUR_TASK_INFO(master_th)
1778                   ->thread_num = __kmp_tid_from_gtid(gtid);
1779               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1780                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1781                   &(task_info->task_data), 1,
1782                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783                   ompt_task_implicit);
1784             }
1785 
1786             /* OMPT state */
1787             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1788           } else {
1789             exit_frame_p = &dummy;
1790           }
1791 #endif
1792 
1793           {
1794             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1795             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1796             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1797                                    parent_team->t.t_argv
1798 #if OMPT_SUPPORT
1799                                    ,
1800                                    exit_frame_p
1801 #endif
1802                                    );
1803           }
1804 
1805 #if OMPT_SUPPORT
1806           if (ompt_enabled.enabled) {
1807             *exit_frame_p = NULL;
1808             if (ompt_enabled.ompt_callback_implicit_task) {
1809               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1810                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1811                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1812                   ompt_task_implicit);
1813             }
1814             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1815             __ompt_lw_taskteam_unlink(master_th);
1816             if (ompt_enabled.ompt_callback_parallel_end) {
1817               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1818                   &ompt_parallel_data, parent_task_data,
1819                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1820                   return_address);
1821             }
1822             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1823           }
1824 #endif
1825         } else if (microtask == (microtask_t)__kmp_teams_master) {
1826           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1827                            master_th->th.th_serial_team);
1828           team = master_th->th.th_team;
1829           // team->t.t_pkfn = microtask;
1830           team->t.t_invoke = invoker;
1831           __kmp_alloc_argv_entries(argc, team, TRUE);
1832           team->t.t_argc = argc;
1833           argv = (void **)team->t.t_argv;
1834           if (ap) {
1835             for (i = argc - 1; i >= 0; --i)
1836               *argv++ = va_arg(kmp_va_deref(ap), void *);
1837           } else {
1838             for (i = 0; i < argc; ++i)
1839               // Get args from parent team for teams construct
1840               argv[i] = parent_team->t.t_argv[i];
1841           }
1842           // AC: revert change made in __kmpc_serialized_parallel()
1843           //     because initial code in teams should have level=0
1844           team->t.t_level--;
1845           // AC: call special invoker for outer "parallel" of teams construct
1846           invoker(gtid);
1847 #if OMPT_SUPPORT
1848           if (ompt_enabled.enabled) {
1849             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1850             if (ompt_enabled.ompt_callback_implicit_task) {
1851               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1852                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1853                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1854             }
1855             if (ompt_enabled.ompt_callback_parallel_end) {
1856               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1857                   &ompt_parallel_data, parent_task_data,
1858                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1859                   return_address);
1860             }
1861             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1862           }
1863 #endif
1864         } else {
1865           argv = args;
1866           for (i = argc - 1; i >= 0; --i)
1867             *argv++ = va_arg(kmp_va_deref(ap), void *);
1868           KMP_MB();
1869 
1870 #if OMPT_SUPPORT
1871           void *dummy;
1872           void **exit_frame_p;
1873           ompt_task_info_t *task_info;
1874 
1875           ompt_lw_taskteam_t lw_taskteam;
1876 
1877           if (ompt_enabled.enabled) {
1878             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1879                                     &ompt_parallel_data, return_address);
1880             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1881             // don't use lw_taskteam after linking. content was swaped
1882             task_info = OMPT_CUR_TASK_INFO(master_th);
1883             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1884 
1885             /* OMPT implicit task begin */
1886             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1887             if (ompt_enabled.ompt_callback_implicit_task) {
1888               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1889                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1890                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1891                   ompt_task_implicit);
1892               OMPT_CUR_TASK_INFO(master_th)
1893                   ->thread_num = __kmp_tid_from_gtid(gtid);
1894             }
1895 
1896             /* OMPT state */
1897             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1898           } else {
1899             exit_frame_p = &dummy;
1900           }
1901 #endif
1902 
1903           {
1904             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1905             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1906             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1907 #if OMPT_SUPPORT
1908                                    ,
1909                                    exit_frame_p
1910 #endif
1911                                    );
1912           }
1913 
1914 #if OMPT_SUPPORT
1915           if (ompt_enabled.enabled) {
1916             *exit_frame_p = NULL;
1917             if (ompt_enabled.ompt_callback_implicit_task) {
1918               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1919                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1920                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1921                   ompt_task_implicit);
1922             }
1923 
1924             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1925             __ompt_lw_taskteam_unlink(master_th);
1926             if (ompt_enabled.ompt_callback_parallel_end) {
1927               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1928                   &ompt_parallel_data, parent_task_data,
1929                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1930                   return_address);
1931             }
1932             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1933           }
1934 #endif
1935         }
1936       } else if (call_context == fork_context_gnu) {
1937 #if OMPT_SUPPORT
1938         ompt_lw_taskteam_t lwt;
1939         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1940                                 return_address);
1941 
1942         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1943         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1944 // don't use lw_taskteam after linking. content was swaped
1945 #endif
1946 
1947         // we were called from GNU native code
1948         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1949         return FALSE;
1950       } else {
1951         KMP_ASSERT2(call_context < fork_context_last,
1952                     "__kmp_fork_call: unknown fork_context parameter");
1953       }
1954 
1955       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1956       KMP_MB();
1957       return FALSE;
1958     } // if (nthreads == 1)
1959 
1960     // GEH: only modify the executing flag in the case when not serialized
1961     //      serialized case is handled in kmpc_serialized_parallel
1962     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1963                   "curtask=%p, curtask_max_aclevel=%d\n",
1964                   parent_team->t.t_active_level, master_th,
1965                   master_th->th.th_current_task,
1966                   master_th->th.th_current_task->td_icvs.max_active_levels));
1967     // TODO: GEH - cannot do this assertion because root thread not set up as
1968     // executing
1969     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1970     master_th->th.th_current_task->td_flags.executing = 0;
1971 
1972     if (!master_th->th.th_teams_microtask || level > teams_level) {
1973       /* Increment our nested depth level */
1974       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1975     }
1976 
1977     // See if we need to make a copy of the ICVs.
1978     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1979     if ((level + 1 < __kmp_nested_nth.used) &&
1980         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1981       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1982     } else {
1983       nthreads_icv = 0; // don't update
1984     }
1985 
1986     // Figure out the proc_bind_policy for the new team.
1987     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1988     kmp_proc_bind_t proc_bind_icv =
1989         proc_bind_default; // proc_bind_default means don't update
1990     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1991       proc_bind = proc_bind_false;
1992     } else {
1993       if (proc_bind == proc_bind_default) {
1994         // No proc_bind clause specified; use current proc-bind-var for this
1995         // parallel region
1996         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1997       }
1998       /* else: The proc_bind policy was specified explicitly on parallel clause.
1999          This overrides proc-bind-var for this parallel region, but does not
2000          change proc-bind-var. */
2001       // Figure the value of proc-bind-var for the child threads.
2002       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2003           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2004            master_th->th.th_current_task->td_icvs.proc_bind)) {
2005         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2006       }
2007     }
2008 
2009     // Reset for next parallel region
2010     master_th->th.th_set_proc_bind = proc_bind_default;
2011 
2012     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2013       kmp_internal_control_t new_icvs;
2014       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2015       new_icvs.next = NULL;
2016       if (nthreads_icv > 0) {
2017         new_icvs.nproc = nthreads_icv;
2018       }
2019       if (proc_bind_icv != proc_bind_default) {
2020         new_icvs.proc_bind = proc_bind_icv;
2021       }
2022 
2023       /* allocate a new parallel team */
2024       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2025       team = __kmp_allocate_team(root, nthreads, nthreads,
2026 #if OMPT_SUPPORT
2027                                  ompt_parallel_data,
2028 #endif
2029                                  proc_bind, &new_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     } else {
2032       /* allocate a new parallel team */
2033       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2034       team = __kmp_allocate_team(root, nthreads, nthreads,
2035 #if OMPT_SUPPORT
2036                                  ompt_parallel_data,
2037 #endif
2038                                  proc_bind,
2039                                  &master_th->th.th_current_task->td_icvs,
2040                                  argc USE_NESTED_HOT_ARG(master_th));
2041     }
2042     KF_TRACE(
2043         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2044 
2045     /* setup the new team */
2046     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2047     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2048     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2049     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2050     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2051 #if OMPT_SUPPORT
2052     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2053                           return_address);
2054 #endif
2055     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2056     // TODO: parent_team->t.t_level == INT_MAX ???
2057     if (!master_th->th.th_teams_microtask || level > teams_level) {
2058       int new_level = parent_team->t.t_level + 1;
2059       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2060       new_level = parent_team->t.t_active_level + 1;
2061       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2062     } else {
2063       // AC: Do not increase parallel level at start of the teams construct
2064       int new_level = parent_team->t.t_level;
2065       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2066       new_level = parent_team->t.t_active_level;
2067       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2068     }
2069     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2070     // set master's schedule as new run-time schedule
2071     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2072 
2073     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2074     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2075 
2076     // Update the floating point rounding in the team if required.
2077     propagateFPControl(team);
2078 
2079     if (__kmp_tasking_mode != tskm_immediate_exec) {
2080       // Set master's task team to team's task team. Unless this is hot team, it
2081       // should be NULL.
2082       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2083                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2084       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2085                     "%p, new task_team %p / team %p\n",
2086                     __kmp_gtid_from_thread(master_th),
2087                     master_th->th.th_task_team, parent_team,
2088                     team->t.t_task_team[master_th->th.th_task_state], team));
2089 
2090       if (active_level || master_th->th.th_task_team) {
2091         // Take a memo of master's task_state
2092         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2093         if (master_th->th.th_task_state_top >=
2094             master_th->th.th_task_state_stack_sz) { // increase size
2095           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2096           kmp_uint8 *old_stack, *new_stack;
2097           kmp_uint32 i;
2098           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2099           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2100             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2101           }
2102           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2103                ++i) { // zero-init rest of stack
2104             new_stack[i] = 0;
2105           }
2106           old_stack = master_th->th.th_task_state_memo_stack;
2107           master_th->th.th_task_state_memo_stack = new_stack;
2108           master_th->th.th_task_state_stack_sz = new_size;
2109           __kmp_free(old_stack);
2110         }
2111         // Store master's task_state on stack
2112         master_th->th
2113             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2114             master_th->th.th_task_state;
2115         master_th->th.th_task_state_top++;
2116 #if KMP_NESTED_HOT_TEAMS
2117         if (master_th->th.th_hot_teams &&
2118             active_level < __kmp_hot_teams_max_level &&
2119             team == master_th->th.th_hot_teams[active_level].hot_team) {
2120           // Restore master's nested state if nested hot team
2121           master_th->th.th_task_state =
2122               master_th->th
2123                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2124         } else {
2125 #endif
2126           master_th->th.th_task_state = 0;
2127 #if KMP_NESTED_HOT_TEAMS
2128         }
2129 #endif
2130       }
2131 #if !KMP_NESTED_HOT_TEAMS
2132       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2133                        (team == root->r.r_hot_team));
2134 #endif
2135     }
2136 
2137     KA_TRACE(
2138         20,
2139         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2140          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2141          team->t.t_nproc));
2142     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2143                      (team->t.t_master_tid == 0 &&
2144                       (team->t.t_parent == root->r.r_root_team ||
2145                        team->t.t_parent->t.t_serialized)));
2146     KMP_MB();
2147 
2148     /* now, setup the arguments */
2149     argv = (void **)team->t.t_argv;
2150     if (ap) {
2151       for (i = argc - 1; i >= 0; --i) {
2152         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156     } else {
2157       for (i = 0; i < argc; ++i) {
2158         // Get args from parent team for teams construct
2159         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2160       }
2161     }
2162 
2163     /* now actually fork the threads */
2164     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2165     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2166       root->r.r_active = TRUE;
2167 
2168     __kmp_fork_team_threads(root, team, master_th, gtid);
2169     __kmp_setup_icv_copy(team, nthreads,
2170                          &master_th->th.th_current_task->td_icvs, loc);
2171 
2172 #if OMPT_SUPPORT
2173     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2174 #endif
2175 
2176     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2177 
2178 #if USE_ITT_BUILD
2179     if (team->t.t_active_level == 1 // only report frames at level 1
2180         && !master_th->th.th_teams_microtask) { // not in teams construct
2181 #if USE_ITT_NOTIFY
2182       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2183           (__kmp_forkjoin_frames_mode == 3 ||
2184            __kmp_forkjoin_frames_mode == 1)) {
2185         kmp_uint64 tmp_time = 0;
2186         if (__itt_get_timestamp_ptr)
2187           tmp_time = __itt_get_timestamp();
2188         // Internal fork - report frame begin
2189         master_th->th.th_frame_time = tmp_time;
2190         if (__kmp_forkjoin_frames_mode == 3)
2191           team->t.t_region_time = tmp_time;
2192       } else
2193 // only one notification scheme (either "submit" or "forking/joined", not both)
2194 #endif /* USE_ITT_NOTIFY */
2195           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2196               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2197         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2198         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2199       }
2200     }
2201 #endif /* USE_ITT_BUILD */
2202 
2203     /* now go on and do the work */
2204     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2205     KMP_MB();
2206     KF_TRACE(10,
2207              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2208               root, team, master_th, gtid));
2209 
2210 #if USE_ITT_BUILD
2211     if (__itt_stack_caller_create_ptr) {
2212       team->t.t_stack_id =
2213           __kmp_itt_stack_caller_create(); // create new stack stitching id
2214       // before entering fork barrier
2215     }
2216 #endif /* USE_ITT_BUILD */
2217 
2218     // AC: skip __kmp_internal_fork at teams construct, let only master
2219     // threads execute
2220     if (ap) {
2221       __kmp_internal_fork(loc, gtid, team);
2222       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2223                     "master_th=%p, gtid=%d\n",
2224                     root, team, master_th, gtid));
2225     }
2226 
2227     if (call_context == fork_context_gnu) {
2228       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229       return TRUE;
2230     }
2231 
2232     /* Invoke microtask for MASTER thread */
2233     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2234                   team->t.t_id, team->t.t_pkfn));
2235   } // END of timer KMP_fork_call block
2236 
2237 #if KMP_STATS_ENABLED
2238   // If beginning a teams construct, then change thread state
2239   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2240   if (!ap) {
2241     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2242   }
2243 #endif
2244 
2245   if (!team->t.t_invoke(gtid)) {
2246     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2247   }
2248 
2249 #if KMP_STATS_ENABLED
2250   // If was beginning of a teams construct, then reset thread state
2251   if (!ap) {
2252     KMP_SET_THREAD_STATE(previous_state);
2253   }
2254 #endif
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2257                 team->t.t_id, team->t.t_pkfn));
2258   KMP_MB(); /* Flush all pending memory write invalidates.  */
2259 
2260   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2261 
2262 #if OMPT_SUPPORT
2263   if (ompt_enabled.enabled) {
2264     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2265   }
2266 #endif
2267 
2268   return TRUE;
2269 }
2270 
2271 #if OMPT_SUPPORT
2272 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2273                                             kmp_team_t *team) {
2274   // restore state outside the region
2275   thread->th.ompt_thread_info.state =
2276       ((team->t.t_serialized) ? ompt_state_work_serial
2277                               : ompt_state_work_parallel);
2278 }
2279 
2280 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2281                                    kmp_team_t *team, ompt_data_t *parallel_data,
2282                                    int flags, void *codeptr) {
2283   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2284   if (ompt_enabled.ompt_callback_parallel_end) {
2285     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2286         parallel_data, &(task_info->task_data), flags, codeptr);
2287   }
2288 
2289   task_info->frame.enter_frame = ompt_data_none;
2290   __kmp_join_restore_state(thread, team);
2291 }
2292 #endif
2293 
2294 void __kmp_join_call(ident_t *loc, int gtid
2295 #if OMPT_SUPPORT
2296                      ,
2297                      enum fork_context_e fork_context
2298 #endif
2299                      ,
2300                      int exit_teams) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307 
2308   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2309 
2310   /* setup current data */
2311   master_th = __kmp_threads[gtid];
2312   root = master_th->th.th_root;
2313   team = master_th->th.th_team;
2314   parent_team = team->t.t_parent;
2315 
2316   master_th->th.th_ident = loc;
2317 
2318 #if OMPT_SUPPORT
2319   void *team_microtask = (void *)team->t.t_pkfn;
2320   // For GOMP interface with serialized parallel, need the
2321   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2322   // and end-parallel events.
2323   if (ompt_enabled.enabled &&
2324       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2325     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2326   }
2327 #endif
2328 
2329 #if KMP_DEBUG
2330   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2331     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2332                   "th_task_team = %p\n",
2333                   __kmp_gtid_from_thread(master_th), team,
2334                   team->t.t_task_team[master_th->th.th_task_state],
2335                   master_th->th.th_task_team));
2336     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2337                      team->t.t_task_team[master_th->th.th_task_state]);
2338   }
2339 #endif
2340 
2341   if (team->t.t_serialized) {
2342     if (master_th->th.th_teams_microtask) {
2343       // We are in teams construct
2344       int level = team->t.t_level;
2345       int tlevel = master_th->th.th_teams_level;
2346       if (level == tlevel) {
2347         // AC: we haven't incremented it earlier at start of teams construct,
2348         //     so do it here - at the end of teams construct
2349         team->t.t_level++;
2350       } else if (level == tlevel + 1) {
2351         // AC: we are exiting parallel inside teams, need to increment
2352         // serialization in order to restore it in the next call to
2353         // __kmpc_end_serialized_parallel
2354         team->t.t_serialized++;
2355       }
2356     }
2357     __kmpc_end_serialized_parallel(loc, gtid);
2358 
2359 #if OMPT_SUPPORT
2360     if (ompt_enabled.enabled) {
2361       __kmp_join_restore_state(master_th, parent_team);
2362     }
2363 #endif
2364 
2365     return;
2366   }
2367 
2368   master_active = team->t.t_master_active;
2369 
2370   if (!exit_teams) {
2371     // AC: No barrier for internal teams at exit from teams construct.
2372     //     But there is barrier for external team (league).
2373     __kmp_internal_join(loc, gtid, team);
2374   } else {
2375     master_th->th.th_task_state =
2376         0; // AC: no tasking in teams (out of any parallel)
2377   }
2378 
2379   KMP_MB();
2380 
2381 #if OMPT_SUPPORT
2382   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2383   void *codeptr = team->t.ompt_team_info.master_return_address;
2384 #endif
2385 
2386 #if USE_ITT_BUILD
2387   if (__itt_stack_caller_create_ptr) {
2388     // destroy the stack stitching id after join barrier
2389     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2390   }
2391   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2392   if (team->t.t_active_level == 1 &&
2393       (!master_th->th.th_teams_microtask || /* not in teams construct */
2394        master_th->th.th_teams_size.nteams == 1)) {
2395     master_th->th.th_ident = loc;
2396     // only one notification scheme (either "submit" or "forking/joined", not
2397     // both)
2398     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2399         __kmp_forkjoin_frames_mode == 3)
2400       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2401                              master_th->th.th_frame_time, 0, loc,
2402                              master_th->th.th_team_nproc, 1);
2403     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2404              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2405       __kmp_itt_region_joined(gtid);
2406   } // active_level == 1
2407 #endif /* USE_ITT_BUILD */
2408 
2409   if (master_th->th.th_teams_microtask && !exit_teams &&
2410       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2411       team->t.t_level == master_th->th.th_teams_level + 1) {
2412 // AC: We need to leave the team structure intact at the end of parallel
2413 // inside the teams construct, so that at the next parallel same (hot) team
2414 // works, only adjust nesting levels
2415 #if OMPT_SUPPORT
2416     ompt_data_t ompt_parallel_data = ompt_data_none;
2417     if (ompt_enabled.enabled) {
2418       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2419       if (ompt_enabled.ompt_callback_implicit_task) {
2420         int ompt_team_size = team->t.t_nproc;
2421         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2422             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2423             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2424       }
2425       task_info->frame.exit_frame = ompt_data_none;
2426       task_info->task_data = ompt_data_none;
2427       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2428       __ompt_lw_taskteam_unlink(master_th);
2429     }
2430 #endif
2431     /* Decrement our nested depth level */
2432     team->t.t_level--;
2433     team->t.t_active_level--;
2434     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2435 
2436     // Restore number of threads in the team if needed. This code relies on
2437     // the proper adjustment of th_teams_size.nth after the fork in
2438     // __kmp_teams_master on each teams master in the case that
2439     // __kmp_reserve_threads reduced it.
2440     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2441       int old_num = master_th->th.th_team_nproc;
2442       int new_num = master_th->th.th_teams_size.nth;
2443       kmp_info_t **other_threads = team->t.t_threads;
2444       team->t.t_nproc = new_num;
2445       for (int i = 0; i < old_num; ++i) {
2446         other_threads[i]->th.th_team_nproc = new_num;
2447       }
2448       // Adjust states of non-used threads of the team
2449       for (int i = old_num; i < new_num; ++i) {
2450         // Re-initialize thread's barrier data.
2451         KMP_DEBUG_ASSERT(other_threads[i]);
2452         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2453         for (int b = 0; b < bs_last_barrier; ++b) {
2454           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2455           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2456 #if USE_DEBUGGER
2457           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2458 #endif
2459         }
2460         if (__kmp_tasking_mode != tskm_immediate_exec) {
2461           // Synchronize thread's task state
2462           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2463         }
2464       }
2465     }
2466 
2467 #if OMPT_SUPPORT
2468     if (ompt_enabled.enabled) {
2469       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2470                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2471     }
2472 #endif
2473 
2474     return;
2475   }
2476 
2477   /* do cleanup and restore the parent team */
2478   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2479   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2480 
2481   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2482 
2483   /* jc: The following lock has instructions with REL and ACQ semantics,
2484      separating the parallel user code called in this parallel region
2485      from the serial user code called after this function returns. */
2486   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2487 
2488   if (!master_th->th.th_teams_microtask ||
2489       team->t.t_level > master_th->th.th_teams_level) {
2490     /* Decrement our nested depth level */
2491     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2492   }
2493   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2494 
2495 #if OMPT_SUPPORT
2496   if (ompt_enabled.enabled) {
2497     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2498     if (ompt_enabled.ompt_callback_implicit_task) {
2499       int flags = (team_microtask == (void *)__kmp_teams_master)
2500                       ? ompt_task_initial
2501                       : ompt_task_implicit;
2502       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2504           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2505           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2506     }
2507     task_info->frame.exit_frame = ompt_data_none;
2508     task_info->task_data = ompt_data_none;
2509   }
2510 #endif
2511 
2512   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2513                 master_th, team));
2514   __kmp_pop_current_task_from_thread(master_th);
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517   // Restore master thread's partition.
2518   master_th->th.th_first_place = team->t.t_first_place;
2519   master_th->th.th_last_place = team->t.t_last_place;
2520 #endif // KMP_AFFINITY_SUPPORTED
2521   master_th->th.th_def_allocator = team->t.t_def_allocator;
2522 
2523   updateHWFPControl(team);
2524 
2525   if (root->r.r_active != master_active)
2526     root->r.r_active = master_active;
2527 
2528   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2529                             master_th)); // this will free worker threads
2530 
2531   /* this race was fun to find. make sure the following is in the critical
2532      region otherwise assertions may fail occasionally since the old team may be
2533      reallocated and the hierarchy appears inconsistent. it is actually safe to
2534      run and won't cause any bugs, but will cause those assertion failures. it's
2535      only one deref&assign so might as well put this in the critical region */
2536   master_th->th.th_team = parent_team;
2537   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2538   master_th->th.th_team_master = parent_team->t.t_threads[0];
2539   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2540 
2541   /* restore serialized team, if need be */
2542   if (parent_team->t.t_serialized &&
2543       parent_team != master_th->th.th_serial_team &&
2544       parent_team != root->r.r_root_team) {
2545     __kmp_free_team(root,
2546                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2547     master_th->th.th_serial_team = parent_team;
2548   }
2549 
2550   if (__kmp_tasking_mode != tskm_immediate_exec) {
2551     if (master_th->th.th_task_state_top >
2552         0) { // Restore task state from memo stack
2553       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2554       // Remember master's state if we re-use this nested hot team
2555       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2556           master_th->th.th_task_state;
2557       --master_th->th.th_task_state_top; // pop
2558       // Now restore state at this level
2559       master_th->th.th_task_state =
2560           master_th->th
2561               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2562     }
2563     // Copy the task team from the parent team to the master thread
2564     master_th->th.th_task_team =
2565         parent_team->t.t_task_team[master_th->th.th_task_state];
2566     KA_TRACE(20,
2567              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2568               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2569               parent_team));
2570   }
2571 
2572   // TODO: GEH - cannot do this assertion because root thread not set up as
2573   // executing
2574   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2575   master_th->th.th_current_task->td_flags.executing = 1;
2576 
2577   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2578 
2579 #if OMPT_SUPPORT
2580   int flags =
2581       OMPT_INVOKER(fork_context) |
2582       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2583                                                       : ompt_parallel_team);
2584   if (ompt_enabled.enabled) {
2585     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2586                     codeptr);
2587   }
2588 #endif
2589 
2590   KMP_MB();
2591   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2592 }
2593 
2594 /* Check whether we should push an internal control record onto the
2595    serial team stack.  If so, do it.  */
2596 void __kmp_save_internal_controls(kmp_info_t *thread) {
2597 
2598   if (thread->th.th_team != thread->th.th_serial_team) {
2599     return;
2600   }
2601   if (thread->th.th_team->t.t_serialized > 1) {
2602     int push = 0;
2603 
2604     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2605       push = 1;
2606     } else {
2607       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2608           thread->th.th_team->t.t_serialized) {
2609         push = 1;
2610       }
2611     }
2612     if (push) { /* push a record on the serial team's stack */
2613       kmp_internal_control_t *control =
2614           (kmp_internal_control_t *)__kmp_allocate(
2615               sizeof(kmp_internal_control_t));
2616 
2617       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2618 
2619       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2620 
2621       control->next = thread->th.th_team->t.t_control_stack_top;
2622       thread->th.th_team->t.t_control_stack_top = control;
2623     }
2624   }
2625 }
2626 
2627 /* Changes set_nproc */
2628 void __kmp_set_num_threads(int new_nth, int gtid) {
2629   kmp_info_t *thread;
2630   kmp_root_t *root;
2631 
2632   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2633   KMP_DEBUG_ASSERT(__kmp_init_serial);
2634 
2635   if (new_nth < 1)
2636     new_nth = 1;
2637   else if (new_nth > __kmp_max_nth)
2638     new_nth = __kmp_max_nth;
2639 
2640   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2641   thread = __kmp_threads[gtid];
2642   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2643     return; // nothing to do
2644 
2645   __kmp_save_internal_controls(thread);
2646 
2647   set__nproc(thread, new_nth);
2648 
2649   // If this omp_set_num_threads() call will cause the hot team size to be
2650   // reduced (in the absence of a num_threads clause), then reduce it now,
2651   // rather than waiting for the next parallel region.
2652   root = thread->th.th_root;
2653   if (__kmp_init_parallel && (!root->r.r_active) &&
2654       (root->r.r_hot_team->t.t_nproc > new_nth)
2655 #if KMP_NESTED_HOT_TEAMS
2656       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2657 #endif
2658       ) {
2659     kmp_team_t *hot_team = root->r.r_hot_team;
2660     int f;
2661 
2662     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2663 
2664     // Release the extra threads we don't need any more.
2665     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2666       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2667       if (__kmp_tasking_mode != tskm_immediate_exec) {
2668         // When decreasing team size, threads no longer in the team should unref
2669         // task team.
2670         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2671       }
2672       __kmp_free_thread(hot_team->t.t_threads[f]);
2673       hot_team->t.t_threads[f] = NULL;
2674     }
2675     hot_team->t.t_nproc = new_nth;
2676 #if KMP_NESTED_HOT_TEAMS
2677     if (thread->th.th_hot_teams) {
2678       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2679       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2680     }
2681 #endif
2682 
2683     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2684 
2685     // Update the t_nproc field in the threads that are still active.
2686     for (f = 0; f < new_nth; f++) {
2687       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2688       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2689     }
2690     // Special flag in case omp_set_num_threads() call
2691     hot_team->t.t_size_changed = -1;
2692   }
2693 }
2694 
2695 /* Changes max_active_levels */
2696 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2697   kmp_info_t *thread;
2698 
2699   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2700                 "%d = (%d)\n",
2701                 gtid, max_active_levels));
2702   KMP_DEBUG_ASSERT(__kmp_init_serial);
2703 
2704   // validate max_active_levels
2705   if (max_active_levels < 0) {
2706     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2707     // We ignore this call if the user has specified a negative value.
2708     // The current setting won't be changed. The last valid setting will be
2709     // used. A warning will be issued (if warnings are allowed as controlled by
2710     // the KMP_WARNINGS env var).
2711     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2712                   "max_active_levels for thread %d = (%d)\n",
2713                   gtid, max_active_levels));
2714     return;
2715   }
2716   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2717     // it's OK, the max_active_levels is within the valid range: [ 0;
2718     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2719     // We allow a zero value. (implementation defined behavior)
2720   } else {
2721     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2722                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2723     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2724     // Current upper limit is MAX_INT. (implementation defined behavior)
2725     // If the input exceeds the upper limit, we correct the input to be the
2726     // upper limit. (implementation defined behavior)
2727     // Actually, the flow should never get here until we use MAX_INT limit.
2728   }
2729   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2730                 "max_active_levels for thread %d = (%d)\n",
2731                 gtid, max_active_levels));
2732 
2733   thread = __kmp_threads[gtid];
2734 
2735   __kmp_save_internal_controls(thread);
2736 
2737   set__max_active_levels(thread, max_active_levels);
2738 }
2739 
2740 /* Gets max_active_levels */
2741 int __kmp_get_max_active_levels(int gtid) {
2742   kmp_info_t *thread;
2743 
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2745   KMP_DEBUG_ASSERT(__kmp_init_serial);
2746 
2747   thread = __kmp_threads[gtid];
2748   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2749   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2750                 "curtask_maxaclevel=%d\n",
2751                 gtid, thread->th.th_current_task,
2752                 thread->th.th_current_task->td_icvs.max_active_levels));
2753   return thread->th.th_current_task->td_icvs.max_active_levels;
2754 }
2755 
2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2758 
2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2761   kmp_info_t *thread;
2762   kmp_sched_t orig_kind;
2763   //    kmp_team_t *team;
2764 
2765   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2766                 gtid, (int)kind, chunk));
2767   KMP_DEBUG_ASSERT(__kmp_init_serial);
2768 
2769   // Check if the kind parameter is valid, correct if needed.
2770   // Valid parameters should fit in one of two intervals - standard or extended:
2771   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2772   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2773   orig_kind = kind;
2774   kind = __kmp_sched_without_mods(kind);
2775 
2776   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2777       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2778     // TODO: Hint needs attention in case we change the default schedule.
2779     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2780               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2781               __kmp_msg_null);
2782     kind = kmp_sched_default;
2783     chunk = 0; // ignore chunk value in case of bad kind
2784   }
2785 
2786   thread = __kmp_threads[gtid];
2787 
2788   __kmp_save_internal_controls(thread);
2789 
2790   if (kind < kmp_sched_upper_std) {
2791     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2792       // differ static chunked vs. unchunked:  chunk should be invalid to
2793       // indicate unchunked schedule (which is the default)
2794       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2795     } else {
2796       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2797           __kmp_sch_map[kind - kmp_sched_lower - 1];
2798     }
2799   } else {
2800     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2801     //    kmp_sched_lower - 2 ];
2802     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2804                       kmp_sched_lower - 2];
2805   }
2806   __kmp_sched_apply_mods_intkind(
2807       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2808   if (kind == kmp_sched_auto || chunk < 1) {
2809     // ignore parameter chunk for schedule auto
2810     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2811   } else {
2812     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2813   }
2814 }
2815 
2816 /* Gets def_sched_var ICV values */
2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2818   kmp_info_t *thread;
2819   enum sched_type th_type;
2820 
2821   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2822   KMP_DEBUG_ASSERT(__kmp_init_serial);
2823 
2824   thread = __kmp_threads[gtid];
2825 
2826   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2827   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2828   case kmp_sch_static:
2829   case kmp_sch_static_greedy:
2830   case kmp_sch_static_balanced:
2831     *kind = kmp_sched_static;
2832     __kmp_sched_apply_mods_stdkind(kind, th_type);
2833     *chunk = 0; // chunk was not set, try to show this fact via zero value
2834     return;
2835   case kmp_sch_static_chunked:
2836     *kind = kmp_sched_static;
2837     break;
2838   case kmp_sch_dynamic_chunked:
2839     *kind = kmp_sched_dynamic;
2840     break;
2841   case kmp_sch_guided_chunked:
2842   case kmp_sch_guided_iterative_chunked:
2843   case kmp_sch_guided_analytical_chunked:
2844     *kind = kmp_sched_guided;
2845     break;
2846   case kmp_sch_auto:
2847     *kind = kmp_sched_auto;
2848     break;
2849   case kmp_sch_trapezoidal:
2850     *kind = kmp_sched_trapezoidal;
2851     break;
2852 #if KMP_STATIC_STEAL_ENABLED
2853   case kmp_sch_static_steal:
2854     *kind = kmp_sched_static_steal;
2855     break;
2856 #endif
2857   default:
2858     KMP_FATAL(UnknownSchedulingType, th_type);
2859   }
2860 
2861   __kmp_sched_apply_mods_stdkind(kind, th_type);
2862   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2863 }
2864 
2865 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2866 
2867   int ii, dd;
2868   kmp_team_t *team;
2869   kmp_info_t *thr;
2870 
2871   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2872   KMP_DEBUG_ASSERT(__kmp_init_serial);
2873 
2874   // validate level
2875   if (level == 0)
2876     return 0;
2877   if (level < 0)
2878     return -1;
2879   thr = __kmp_threads[gtid];
2880   team = thr->th.th_team;
2881   ii = team->t.t_level;
2882   if (level > ii)
2883     return -1;
2884 
2885   if (thr->th.th_teams_microtask) {
2886     // AC: we are in teams region where multiple nested teams have same level
2887     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2888     if (level <=
2889         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2890       KMP_DEBUG_ASSERT(ii >= tlevel);
2891       // AC: As we need to pass by the teams league, we need to artificially
2892       // increase ii
2893       if (ii == tlevel) {
2894         ii += 2; // three teams have same level
2895       } else {
2896         ii++; // two teams have same level
2897       }
2898     }
2899   }
2900 
2901   if (ii == level)
2902     return __kmp_tid_from_gtid(gtid);
2903 
2904   dd = team->t.t_serialized;
2905   level++;
2906   while (ii > level) {
2907     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2908     }
2909     if ((team->t.t_serialized) && (!dd)) {
2910       team = team->t.t_parent;
2911       continue;
2912     }
2913     if (ii > level) {
2914       team = team->t.t_parent;
2915       dd = team->t.t_serialized;
2916       ii--;
2917     }
2918   }
2919 
2920   return (dd > 1) ? (0) : (team->t.t_master_tid);
2921 }
2922 
2923 int __kmp_get_team_size(int gtid, int level) {
2924 
2925   int ii, dd;
2926   kmp_team_t *team;
2927   kmp_info_t *thr;
2928 
2929   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2930   KMP_DEBUG_ASSERT(__kmp_init_serial);
2931 
2932   // validate level
2933   if (level == 0)
2934     return 1;
2935   if (level < 0)
2936     return -1;
2937   thr = __kmp_threads[gtid];
2938   team = thr->th.th_team;
2939   ii = team->t.t_level;
2940   if (level > ii)
2941     return -1;
2942 
2943   if (thr->th.th_teams_microtask) {
2944     // AC: we are in teams region where multiple nested teams have same level
2945     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2946     if (level <=
2947         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2948       KMP_DEBUG_ASSERT(ii >= tlevel);
2949       // AC: As we need to pass by the teams league, we need to artificially
2950       // increase ii
2951       if (ii == tlevel) {
2952         ii += 2; // three teams have same level
2953       } else {
2954         ii++; // two teams have same level
2955       }
2956     }
2957   }
2958 
2959   while (ii > level) {
2960     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2961     }
2962     if (team->t.t_serialized && (!dd)) {
2963       team = team->t.t_parent;
2964       continue;
2965     }
2966     if (ii > level) {
2967       team = team->t.t_parent;
2968       ii--;
2969     }
2970   }
2971 
2972   return team->t.t_nproc;
2973 }
2974 
2975 kmp_r_sched_t __kmp_get_schedule_global() {
2976   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2977   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2978   // independently. So one can get the updated schedule here.
2979 
2980   kmp_r_sched_t r_sched;
2981 
2982   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2983   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2984   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2985   // different roots (even in OMP 2.5)
2986   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2987   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2988   if (s == kmp_sch_static) {
2989     // replace STATIC with more detailed schedule (balanced or greedy)
2990     r_sched.r_sched_type = __kmp_static;
2991   } else if (s == kmp_sch_guided_chunked) {
2992     // replace GUIDED with more detailed schedule (iterative or analytical)
2993     r_sched.r_sched_type = __kmp_guided;
2994   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2995     r_sched.r_sched_type = __kmp_sched;
2996   }
2997   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2998 
2999   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3000     // __kmp_chunk may be wrong here (if it was not ever set)
3001     r_sched.chunk = KMP_DEFAULT_CHUNK;
3002   } else {
3003     r_sched.chunk = __kmp_chunk;
3004   }
3005 
3006   return r_sched;
3007 }
3008 
3009 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3010    at least argc number of *t_argv entries for the requested team. */
3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3012 
3013   KMP_DEBUG_ASSERT(team);
3014   if (!realloc || argc > team->t.t_max_argc) {
3015 
3016     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3017                    "current entries=%d\n",
3018                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3019     /* if previously allocated heap space for args, free them */
3020     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3021       __kmp_free((void *)team->t.t_argv);
3022 
3023     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3024       /* use unused space in the cache line for arguments */
3025       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3026       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3027                      "argv entries\n",
3028                      team->t.t_id, team->t.t_max_argc));
3029       team->t.t_argv = &team->t.t_inline_argv[0];
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(
3032             -1, &team->t.t_inline_argv[0],
3033             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3034             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3035             team->t.t_id);
3036       }
3037     } else {
3038       /* allocate space for arguments in the heap */
3039       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3040                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3041                                : 2 * argc;
3042       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3043                      "argv entries\n",
3044                      team->t.t_id, team->t.t_max_argc));
3045       team->t.t_argv =
3046           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3047       if (__kmp_storage_map) {
3048         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3049                                      &team->t.t_argv[team->t.t_max_argc],
3050                                      sizeof(void *) * team->t.t_max_argc,
3051                                      "team_%d.t_argv", team->t.t_id);
3052       }
3053     }
3054   }
3055 }
3056 
3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3058   int i;
3059   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3060   team->t.t_threads =
3061       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3062   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3063       sizeof(dispatch_shared_info_t) * num_disp_buff);
3064   team->t.t_dispatch =
3065       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3066   team->t.t_implicit_task_taskdata =
3067       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3068   team->t.t_max_nproc = max_nth;
3069 
3070   /* setup dispatch buffers */
3071   for (i = 0; i < num_disp_buff; ++i) {
3072     team->t.t_disp_buffer[i].buffer_index = i;
3073     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3074   }
3075 }
3076 
3077 static void __kmp_free_team_arrays(kmp_team_t *team) {
3078   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3079   int i;
3080   for (i = 0; i < team->t.t_max_nproc; ++i) {
3081     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3082       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3083       team->t.t_dispatch[i].th_disp_buffer = NULL;
3084     }
3085   }
3086 #if KMP_USE_HIER_SCHED
3087   __kmp_dispatch_free_hierarchies(team);
3088 #endif
3089   __kmp_free(team->t.t_threads);
3090   __kmp_free(team->t.t_disp_buffer);
3091   __kmp_free(team->t.t_dispatch);
3092   __kmp_free(team->t.t_implicit_task_taskdata);
3093   team->t.t_threads = NULL;
3094   team->t.t_disp_buffer = NULL;
3095   team->t.t_dispatch = NULL;
3096   team->t.t_implicit_task_taskdata = 0;
3097 }
3098 
3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3100   kmp_info_t **oldThreads = team->t.t_threads;
3101 
3102   __kmp_free(team->t.t_disp_buffer);
3103   __kmp_free(team->t.t_dispatch);
3104   __kmp_free(team->t.t_implicit_task_taskdata);
3105   __kmp_allocate_team_arrays(team, max_nth);
3106 
3107   KMP_MEMCPY(team->t.t_threads, oldThreads,
3108              team->t.t_nproc * sizeof(kmp_info_t *));
3109 
3110   __kmp_free(oldThreads);
3111 }
3112 
3113 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3114 
3115   kmp_r_sched_t r_sched =
3116       __kmp_get_schedule_global(); // get current state of scheduling globals
3117 
3118   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3119 
3120   kmp_internal_control_t g_icvs = {
3121     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3122     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3123     // adjustment of threads (per thread)
3124     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3125     // whether blocktime is explicitly set
3126     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3127 #if KMP_USE_MONITOR
3128     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3129 // intervals
3130 #endif
3131     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3132     // next parallel region (per thread)
3133     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3134     __kmp_cg_max_nth, // int thread_limit;
3135     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3136     // for max_active_levels
3137     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3138     // {sched,chunk} pair
3139     __kmp_nested_proc_bind.bind_types[0],
3140     __kmp_default_device,
3141     NULL // struct kmp_internal_control *next;
3142   };
3143 
3144   return g_icvs;
3145 }
3146 
3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3148 
3149   kmp_internal_control_t gx_icvs;
3150   gx_icvs.serial_nesting_level =
3151       0; // probably =team->t.t_serial like in save_inter_controls
3152   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3153   gx_icvs.next = NULL;
3154 
3155   return gx_icvs;
3156 }
3157 
3158 static void __kmp_initialize_root(kmp_root_t *root) {
3159   int f;
3160   kmp_team_t *root_team;
3161   kmp_team_t *hot_team;
3162   int hot_team_max_nth;
3163   kmp_r_sched_t r_sched =
3164       __kmp_get_schedule_global(); // get current state of scheduling globals
3165   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3166   KMP_DEBUG_ASSERT(root);
3167   KMP_ASSERT(!root->r.r_begin);
3168 
3169   /* setup the root state structure */
3170   __kmp_init_lock(&root->r.r_begin_lock);
3171   root->r.r_begin = FALSE;
3172   root->r.r_active = FALSE;
3173   root->r.r_in_parallel = 0;
3174   root->r.r_blocktime = __kmp_dflt_blocktime;
3175 
3176   /* setup the root team for this task */
3177   /* allocate the root team structure */
3178   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3179 
3180   root_team =
3181       __kmp_allocate_team(root,
3182                           1, // new_nproc
3183                           1, // max_nproc
3184 #if OMPT_SUPPORT
3185                           ompt_data_none, // root parallel id
3186 #endif
3187                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3188                           0 // argc
3189                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3190                           );
3191 #if USE_DEBUGGER
3192   // Non-NULL value should be assigned to make the debugger display the root
3193   // team.
3194   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3195 #endif
3196 
3197   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3198 
3199   root->r.r_root_team = root_team;
3200   root_team->t.t_control_stack_top = NULL;
3201 
3202   /* initialize root team */
3203   root_team->t.t_threads[0] = NULL;
3204   root_team->t.t_nproc = 1;
3205   root_team->t.t_serialized = 1;
3206   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3207   root_team->t.t_sched.sched = r_sched.sched;
3208   KA_TRACE(
3209       20,
3210       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3211        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3212 
3213   /* setup the  hot team for this task */
3214   /* allocate the hot team structure */
3215   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3216 
3217   hot_team =
3218       __kmp_allocate_team(root,
3219                           1, // new_nproc
3220                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3221 #if OMPT_SUPPORT
3222                           ompt_data_none, // root parallel id
3223 #endif
3224                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3225                           0 // argc
3226                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3227                           );
3228   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3229 
3230   root->r.r_hot_team = hot_team;
3231   root_team->t.t_control_stack_top = NULL;
3232 
3233   /* first-time initialization */
3234   hot_team->t.t_parent = root_team;
3235 
3236   /* initialize hot team */
3237   hot_team_max_nth = hot_team->t.t_max_nproc;
3238   for (f = 0; f < hot_team_max_nth; ++f) {
3239     hot_team->t.t_threads[f] = NULL;
3240   }
3241   hot_team->t.t_nproc = 1;
3242   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3243   hot_team->t.t_sched.sched = r_sched.sched;
3244   hot_team->t.t_size_changed = 0;
3245 }
3246 
3247 #ifdef KMP_DEBUG
3248 
3249 typedef struct kmp_team_list_item {
3250   kmp_team_p const *entry;
3251   struct kmp_team_list_item *next;
3252 } kmp_team_list_item_t;
3253 typedef kmp_team_list_item_t *kmp_team_list_t;
3254 
3255 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3256     kmp_team_list_t list, // List of teams.
3257     kmp_team_p const *team // Team to add.
3258     ) {
3259 
3260   // List must terminate with item where both entry and next are NULL.
3261   // Team is added to the list only once.
3262   // List is sorted in ascending order by team id.
3263   // Team id is *not* a key.
3264 
3265   kmp_team_list_t l;
3266 
3267   KMP_DEBUG_ASSERT(list != NULL);
3268   if (team == NULL) {
3269     return;
3270   }
3271 
3272   __kmp_print_structure_team_accum(list, team->t.t_parent);
3273   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3274 
3275   // Search list for the team.
3276   l = list;
3277   while (l->next != NULL && l->entry != team) {
3278     l = l->next;
3279   }
3280   if (l->next != NULL) {
3281     return; // Team has been added before, exit.
3282   }
3283 
3284   // Team is not found. Search list again for insertion point.
3285   l = list;
3286   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3287     l = l->next;
3288   }
3289 
3290   // Insert team.
3291   {
3292     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3293         sizeof(kmp_team_list_item_t));
3294     *item = *l;
3295     l->entry = team;
3296     l->next = item;
3297   }
3298 }
3299 
3300 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3301 
3302                                        ) {
3303   __kmp_printf("%s", title);
3304   if (team != NULL) {
3305     __kmp_printf("%2x %p\n", team->t.t_id, team);
3306   } else {
3307     __kmp_printf(" - (nil)\n");
3308   }
3309 }
3310 
3311 static void __kmp_print_structure_thread(char const *title,
3312                                          kmp_info_p const *thread) {
3313   __kmp_printf("%s", title);
3314   if (thread != NULL) {
3315     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3316   } else {
3317     __kmp_printf(" - (nil)\n");
3318   }
3319 }
3320 
3321 void __kmp_print_structure(void) {
3322 
3323   kmp_team_list_t list;
3324 
3325   // Initialize list of teams.
3326   list =
3327       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3328   list->entry = NULL;
3329   list->next = NULL;
3330 
3331   __kmp_printf("\n------------------------------\nGlobal Thread "
3332                "Table\n------------------------------\n");
3333   {
3334     int gtid;
3335     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3336       __kmp_printf("%2d", gtid);
3337       if (__kmp_threads != NULL) {
3338         __kmp_printf(" %p", __kmp_threads[gtid]);
3339       }
3340       if (__kmp_root != NULL) {
3341         __kmp_printf(" %p", __kmp_root[gtid]);
3342       }
3343       __kmp_printf("\n");
3344     }
3345   }
3346 
3347   // Print out __kmp_threads array.
3348   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3349                "----------\n");
3350   if (__kmp_threads != NULL) {
3351     int gtid;
3352     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3353       kmp_info_t const *thread = __kmp_threads[gtid];
3354       if (thread != NULL) {
3355         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3356         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3357         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3358         __kmp_print_structure_team("    Serial Team:  ",
3359                                    thread->th.th_serial_team);
3360         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3361         __kmp_print_structure_thread("    Master:       ",
3362                                      thread->th.th_team_master);
3363         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3364         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3365         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3366         __kmp_print_structure_thread("    Next in pool: ",
3367                                      thread->th.th_next_pool);
3368         __kmp_printf("\n");
3369         __kmp_print_structure_team_accum(list, thread->th.th_team);
3370         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3371       }
3372     }
3373   } else {
3374     __kmp_printf("Threads array is not allocated.\n");
3375   }
3376 
3377   // Print out __kmp_root array.
3378   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3379                "--------\n");
3380   if (__kmp_root != NULL) {
3381     int gtid;
3382     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3383       kmp_root_t const *root = __kmp_root[gtid];
3384       if (root != NULL) {
3385         __kmp_printf("GTID %2d %p:\n", gtid, root);
3386         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3387         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3388         __kmp_print_structure_thread("    Uber Thread:  ",
3389                                      root->r.r_uber_thread);
3390         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3391         __kmp_printf("    In Parallel:  %2d\n",
3392                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3393         __kmp_printf("\n");
3394         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3395         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3396       }
3397     }
3398   } else {
3399     __kmp_printf("Ubers array is not allocated.\n");
3400   }
3401 
3402   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3403                "--------\n");
3404   while (list->next != NULL) {
3405     kmp_team_p const *team = list->entry;
3406     int i;
3407     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3408     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3409     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3410     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3411     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3412     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3413     for (i = 0; i < team->t.t_nproc; ++i) {
3414       __kmp_printf("    Thread %2d:      ", i);
3415       __kmp_print_structure_thread("", team->t.t_threads[i]);
3416     }
3417     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3418     __kmp_printf("\n");
3419     list = list->next;
3420   }
3421 
3422   // Print out __kmp_thread_pool and __kmp_team_pool.
3423   __kmp_printf("\n------------------------------\nPools\n----------------------"
3424                "--------\n");
3425   __kmp_print_structure_thread("Thread pool:          ",
3426                                CCAST(kmp_info_t *, __kmp_thread_pool));
3427   __kmp_print_structure_team("Team pool:            ",
3428                              CCAST(kmp_team_t *, __kmp_team_pool));
3429   __kmp_printf("\n");
3430 
3431   // Free team list.
3432   while (list != NULL) {
3433     kmp_team_list_item_t *item = list;
3434     list = list->next;
3435     KMP_INTERNAL_FREE(item);
3436   }
3437 }
3438 
3439 #endif
3440 
3441 //---------------------------------------------------------------------------
3442 //  Stuff for per-thread fast random number generator
3443 //  Table of primes
3444 static const unsigned __kmp_primes[] = {
3445     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3446     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3447     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3448     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3449     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3450     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3451     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3452     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3453     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3454     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3455     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3456 
3457 //---------------------------------------------------------------------------
3458 //  __kmp_get_random: Get a random number using a linear congruential method.
3459 unsigned short __kmp_get_random(kmp_info_t *thread) {
3460   unsigned x = thread->th.th_x;
3461   unsigned short r = (unsigned short)(x >> 16);
3462 
3463   thread->th.th_x = x * thread->th.th_a + 1;
3464 
3465   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3466                 thread->th.th_info.ds.ds_tid, r));
3467 
3468   return r;
3469 }
3470 //--------------------------------------------------------
3471 // __kmp_init_random: Initialize a random number generator
3472 void __kmp_init_random(kmp_info_t *thread) {
3473   unsigned seed = thread->th.th_info.ds.ds_tid;
3474 
3475   thread->th.th_a =
3476       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3477   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3478   KA_TRACE(30,
3479            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3480 }
3481 
3482 #if KMP_OS_WINDOWS
3483 /* reclaim array entries for root threads that are already dead, returns number
3484  * reclaimed */
3485 static int __kmp_reclaim_dead_roots(void) {
3486   int i, r = 0;
3487 
3488   for (i = 0; i < __kmp_threads_capacity; ++i) {
3489     if (KMP_UBER_GTID(i) &&
3490         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3491         !__kmp_root[i]
3492              ->r.r_active) { // AC: reclaim only roots died in non-active state
3493       r += __kmp_unregister_root_other_thread(i);
3494     }
3495   }
3496   return r;
3497 }
3498 #endif
3499 
3500 /* This function attempts to create free entries in __kmp_threads and
3501    __kmp_root, and returns the number of free entries generated.
3502 
3503    For Windows* OS static library, the first mechanism used is to reclaim array
3504    entries for root threads that are already dead.
3505 
3506    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3507    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3508    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3509    threadprivate cache array has been created. Synchronization with
3510    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3511 
3512    After any dead root reclamation, if the clipping value allows array expansion
3513    to result in the generation of a total of nNeed free slots, the function does
3514    that expansion. If not, nothing is done beyond the possible initial root
3515    thread reclamation.
3516 
3517    If any argument is negative, the behavior is undefined. */
3518 static int __kmp_expand_threads(int nNeed) {
3519   int added = 0;
3520   int minimumRequiredCapacity;
3521   int newCapacity;
3522   kmp_info_t **newThreads;
3523   kmp_root_t **newRoot;
3524 
3525 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3526 // resizing __kmp_threads does not need additional protection if foreign
3527 // threads are present
3528 
3529 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3530   /* only for Windows static library */
3531   /* reclaim array entries for root threads that are already dead */
3532   added = __kmp_reclaim_dead_roots();
3533 
3534   if (nNeed) {
3535     nNeed -= added;
3536     if (nNeed < 0)
3537       nNeed = 0;
3538   }
3539 #endif
3540   if (nNeed <= 0)
3541     return added;
3542 
3543   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3544   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3545   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3546   // > __kmp_max_nth in one of two ways:
3547   //
3548   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3549   //    may not be reused by another thread, so we may need to increase
3550   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3551   //
3552   // 2) New foreign root(s) are encountered.  We always register new foreign
3553   //    roots. This may cause a smaller # of threads to be allocated at
3554   //    subsequent parallel regions, but the worker threads hang around (and
3555   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3556   //
3557   // Anyway, that is the reason for moving the check to see if
3558   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3559   // instead of having it performed here. -BB
3560 
3561   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3562 
3563   /* compute expansion headroom to check if we can expand */
3564   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3565     /* possible expansion too small -- give up */
3566     return added;
3567   }
3568   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3569 
3570   newCapacity = __kmp_threads_capacity;
3571   do {
3572     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3573                                                           : __kmp_sys_max_nth;
3574   } while (newCapacity < minimumRequiredCapacity);
3575   newThreads = (kmp_info_t **)__kmp_allocate(
3576       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3577   newRoot =
3578       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3579   KMP_MEMCPY(newThreads, __kmp_threads,
3580              __kmp_threads_capacity * sizeof(kmp_info_t *));
3581   KMP_MEMCPY(newRoot, __kmp_root,
3582              __kmp_threads_capacity * sizeof(kmp_root_t *));
3583 
3584   kmp_info_t **temp_threads = __kmp_threads;
3585   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3586   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3587   __kmp_free(temp_threads);
3588   added += newCapacity - __kmp_threads_capacity;
3589   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3590 
3591   if (newCapacity > __kmp_tp_capacity) {
3592     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3593     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3594       __kmp_threadprivate_resize_cache(newCapacity);
3595     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3596       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3597     }
3598     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3599   }
3600 
3601   return added;
3602 }
3603 
3604 /* Register the current thread as a root thread and obtain our gtid. We must
3605    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3606    thread that calls from __kmp_do_serial_initialize() */
3607 int __kmp_register_root(int initial_thread) {
3608   kmp_info_t *root_thread;
3609   kmp_root_t *root;
3610   int gtid;
3611   int capacity;
3612   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3613   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3614   KMP_MB();
3615 
3616   /* 2007-03-02:
3617      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3618      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3619      work as expected -- it may return false (that means there is at least one
3620      empty slot in __kmp_threads array), but it is possible the only free slot
3621      is #0, which is reserved for initial thread and so cannot be used for this
3622      one. Following code workarounds this bug.
3623 
3624      However, right solution seems to be not reserving slot #0 for initial
3625      thread because:
3626      (1) there is no magic in slot #0,
3627      (2) we cannot detect initial thread reliably (the first thread which does
3628         serial initialization may be not a real initial thread).
3629   */
3630   capacity = __kmp_threads_capacity;
3631   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3632     --capacity;
3633   }
3634 
3635   /* see if there are too many threads */
3636   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3637     if (__kmp_tp_cached) {
3638       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3639                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3640                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3641     } else {
3642       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3643                   __kmp_msg_null);
3644     }
3645   }
3646 
3647   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3648   // 0: initial thread, also a regular OpenMP thread.
3649   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3650   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3651   // regular OpenMP threads.
3652   if (TCR_4(__kmp_init_hidden_helper_threads)) {
3653     // Find an available thread slot for hidden helper thread. Slots for hidden
3654     // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3655     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3656                    gtid <= __kmp_hidden_helper_threads_num;
3657          gtid++)
3658       ;
3659     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3660     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3661                  "hidden helper thread: T#%d\n",
3662                  gtid));
3663   } else {
3664     /* find an available thread slot */
3665     // Don't reassign the zero slot since we need that to only be used by
3666     // initial thread. Slots for hidden helper threads should also be skipped.
3667     if (initial_thread && __kmp_threads[0] == NULL) {
3668       gtid = 0;
3669     } else {
3670       for (gtid = __kmp_hidden_helper_threads_num + 1;
3671            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3672         ;
3673     }
3674     KA_TRACE(
3675         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3676     KMP_ASSERT(gtid < __kmp_threads_capacity);
3677   }
3678 
3679   /* update global accounting */
3680   __kmp_all_nth++;
3681   TCW_4(__kmp_nth, __kmp_nth + 1);
3682 
3683   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3684   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3685   if (__kmp_adjust_gtid_mode) {
3686     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3687       if (TCR_4(__kmp_gtid_mode) != 2) {
3688         TCW_4(__kmp_gtid_mode, 2);
3689       }
3690     } else {
3691       if (TCR_4(__kmp_gtid_mode) != 1) {
3692         TCW_4(__kmp_gtid_mode, 1);
3693       }
3694     }
3695   }
3696 
3697 #ifdef KMP_ADJUST_BLOCKTIME
3698   /* Adjust blocktime to zero if necessary            */
3699   /* Middle initialization might not have occurred yet */
3700   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3701     if (__kmp_nth > __kmp_avail_proc) {
3702       __kmp_zero_bt = TRUE;
3703     }
3704   }
3705 #endif /* KMP_ADJUST_BLOCKTIME */
3706 
3707   /* setup this new hierarchy */
3708   if (!(root = __kmp_root[gtid])) {
3709     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3710     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3711   }
3712 
3713 #if KMP_STATS_ENABLED
3714   // Initialize stats as soon as possible (right after gtid assignment).
3715   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3716   __kmp_stats_thread_ptr->startLife();
3717   KMP_SET_THREAD_STATE(SERIAL_REGION);
3718   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3719 #endif
3720   __kmp_initialize_root(root);
3721 
3722   /* setup new root thread structure */
3723   if (root->r.r_uber_thread) {
3724     root_thread = root->r.r_uber_thread;
3725   } else {
3726     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3727     if (__kmp_storage_map) {
3728       __kmp_print_thread_storage_map(root_thread, gtid);
3729     }
3730     root_thread->th.th_info.ds.ds_gtid = gtid;
3731 #if OMPT_SUPPORT
3732     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3733 #endif
3734     root_thread->th.th_root = root;
3735     if (__kmp_env_consistency_check) {
3736       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3737     }
3738 #if USE_FAST_MEMORY
3739     __kmp_initialize_fast_memory(root_thread);
3740 #endif /* USE_FAST_MEMORY */
3741 
3742 #if KMP_USE_BGET
3743     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3744     __kmp_initialize_bget(root_thread);
3745 #endif
3746     __kmp_init_random(root_thread); // Initialize random number generator
3747   }
3748 
3749   /* setup the serial team held in reserve by the root thread */
3750   if (!root_thread->th.th_serial_team) {
3751     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3752     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3753     root_thread->th.th_serial_team = __kmp_allocate_team(
3754         root, 1, 1,
3755 #if OMPT_SUPPORT
3756         ompt_data_none, // root parallel id
3757 #endif
3758         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3759   }
3760   KMP_ASSERT(root_thread->th.th_serial_team);
3761   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3762                 root_thread->th.th_serial_team));
3763 
3764   /* drop root_thread into place */
3765   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3766 
3767   root->r.r_root_team->t.t_threads[0] = root_thread;
3768   root->r.r_hot_team->t.t_threads[0] = root_thread;
3769   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3770   // AC: the team created in reserve, not for execution (it is unused for now).
3771   root_thread->th.th_serial_team->t.t_serialized = 0;
3772   root->r.r_uber_thread = root_thread;
3773 
3774   /* initialize the thread, get it ready to go */
3775   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3776   TCW_4(__kmp_init_gtid, TRUE);
3777 
3778   /* prepare the master thread for get_gtid() */
3779   __kmp_gtid_set_specific(gtid);
3780 
3781 #if USE_ITT_BUILD
3782   __kmp_itt_thread_name(gtid);
3783 #endif /* USE_ITT_BUILD */
3784 
3785 #ifdef KMP_TDATA_GTID
3786   __kmp_gtid = gtid;
3787 #endif
3788   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3789   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3790 
3791   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3792                 "plain=%u\n",
3793                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3794                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3795                 KMP_INIT_BARRIER_STATE));
3796   { // Initialize barrier data.
3797     int b;
3798     for (b = 0; b < bs_last_barrier; ++b) {
3799       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3800 #if USE_DEBUGGER
3801       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3802 #endif
3803     }
3804   }
3805   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3806                    KMP_INIT_BARRIER_STATE);
3807 
3808 #if KMP_AFFINITY_SUPPORTED
3809   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3810   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3811   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3812   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3813   if (TCR_4(__kmp_init_middle)) {
3814     __kmp_affinity_set_init_mask(gtid, TRUE);
3815   }
3816 #endif /* KMP_AFFINITY_SUPPORTED */
3817   root_thread->th.th_def_allocator = __kmp_def_allocator;
3818   root_thread->th.th_prev_level = 0;
3819   root_thread->th.th_prev_num_threads = 1;
3820 
3821   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3822   tmp->cg_root = root_thread;
3823   tmp->cg_thread_limit = __kmp_cg_max_nth;
3824   tmp->cg_nthreads = 1;
3825   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3826                  " cg_nthreads init to 1\n",
3827                  root_thread, tmp));
3828   tmp->up = NULL;
3829   root_thread->th.th_cg_roots = tmp;
3830 
3831   __kmp_root_counter++;
3832 
3833 #if OMPT_SUPPORT
3834   if (!initial_thread && ompt_enabled.enabled) {
3835 
3836     kmp_info_t *root_thread = ompt_get_thread();
3837 
3838     ompt_set_thread_state(root_thread, ompt_state_overhead);
3839 
3840     if (ompt_enabled.ompt_callback_thread_begin) {
3841       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3842           ompt_thread_initial, __ompt_get_thread_data_internal());
3843     }
3844     ompt_data_t *task_data;
3845     ompt_data_t *parallel_data;
3846     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3847     if (ompt_enabled.ompt_callback_implicit_task) {
3848       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3849           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3850     }
3851 
3852     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3853   }
3854 #endif
3855 
3856   KMP_MB();
3857   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3858 
3859   return gtid;
3860 }
3861 
3862 #if KMP_NESTED_HOT_TEAMS
3863 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3864                                 const int max_level) {
3865   int i, n, nth;
3866   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3867   if (!hot_teams || !hot_teams[level].hot_team) {
3868     return 0;
3869   }
3870   KMP_DEBUG_ASSERT(level < max_level);
3871   kmp_team_t *team = hot_teams[level].hot_team;
3872   nth = hot_teams[level].hot_team_nth;
3873   n = nth - 1; // master is not freed
3874   if (level < max_level - 1) {
3875     for (i = 0; i < nth; ++i) {
3876       kmp_info_t *th = team->t.t_threads[i];
3877       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3878       if (i > 0 && th->th.th_hot_teams) {
3879         __kmp_free(th->th.th_hot_teams);
3880         th->th.th_hot_teams = NULL;
3881       }
3882     }
3883   }
3884   __kmp_free_team(root, team, NULL);
3885   return n;
3886 }
3887 #endif
3888 
3889 // Resets a root thread and clear its root and hot teams.
3890 // Returns the number of __kmp_threads entries directly and indirectly freed.
3891 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3892   kmp_team_t *root_team = root->r.r_root_team;
3893   kmp_team_t *hot_team = root->r.r_hot_team;
3894   int n = hot_team->t.t_nproc;
3895   int i;
3896 
3897   KMP_DEBUG_ASSERT(!root->r.r_active);
3898 
3899   root->r.r_root_team = NULL;
3900   root->r.r_hot_team = NULL;
3901   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3902   // before call to __kmp_free_team().
3903   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3904 #if KMP_NESTED_HOT_TEAMS
3905   if (__kmp_hot_teams_max_level >
3906       0) { // need to free nested hot teams and their threads if any
3907     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3908       kmp_info_t *th = hot_team->t.t_threads[i];
3909       if (__kmp_hot_teams_max_level > 1) {
3910         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3911       }
3912       if (th->th.th_hot_teams) {
3913         __kmp_free(th->th.th_hot_teams);
3914         th->th.th_hot_teams = NULL;
3915       }
3916     }
3917   }
3918 #endif
3919   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3920 
3921   // Before we can reap the thread, we need to make certain that all other
3922   // threads in the teams that had this root as ancestor have stopped trying to
3923   // steal tasks.
3924   if (__kmp_tasking_mode != tskm_immediate_exec) {
3925     __kmp_wait_to_unref_task_teams();
3926   }
3927 
3928 #if KMP_OS_WINDOWS
3929   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3930   KA_TRACE(
3931       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3932            "\n",
3933            (LPVOID) & (root->r.r_uber_thread->th),
3934            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3935   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3936 #endif /* KMP_OS_WINDOWS */
3937 
3938 #if OMPT_SUPPORT
3939   ompt_data_t *task_data;
3940   ompt_data_t *parallel_data;
3941   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3942   if (ompt_enabled.ompt_callback_implicit_task) {
3943     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3944         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3945   }
3946   if (ompt_enabled.ompt_callback_thread_end) {
3947     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3948         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3949   }
3950 #endif
3951 
3952   TCW_4(__kmp_nth,
3953         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3954   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3955   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3956                  " to %d\n",
3957                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3958                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3959   if (i == 1) {
3960     // need to free contention group structure
3961     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3962                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3963     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3964     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3965     root->r.r_uber_thread->th.th_cg_roots = NULL;
3966   }
3967   __kmp_reap_thread(root->r.r_uber_thread, 1);
3968 
3969   // We canot put root thread to __kmp_thread_pool, so we have to reap it
3970   // instead of freeing.
3971   root->r.r_uber_thread = NULL;
3972   /* mark root as no longer in use */
3973   root->r.r_begin = FALSE;
3974 
3975   return n;
3976 }
3977 
3978 void __kmp_unregister_root_current_thread(int gtid) {
3979   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3980   /* this lock should be ok, since unregister_root_current_thread is never
3981      called during an abort, only during a normal close. furthermore, if you
3982      have the forkjoin lock, you should never try to get the initz lock */
3983   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3984   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3985     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3986                   "exiting T#%d\n",
3987                   gtid));
3988     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3989     return;
3990   }
3991   kmp_root_t *root = __kmp_root[gtid];
3992 
3993   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3994   KMP_ASSERT(KMP_UBER_GTID(gtid));
3995   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3996   KMP_ASSERT(root->r.r_active == FALSE);
3997 
3998   KMP_MB();
3999 
4000   kmp_info_t *thread = __kmp_threads[gtid];
4001   kmp_team_t *team = thread->th.th_team;
4002   kmp_task_team_t *task_team = thread->th.th_task_team;
4003 
4004   // we need to wait for the proxy tasks before finishing the thread
4005   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4006 #if OMPT_SUPPORT
4007     // the runtime is shutting down so we won't report any events
4008     thread->th.ompt_thread_info.state = ompt_state_undefined;
4009 #endif
4010     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4011   }
4012 
4013   __kmp_reset_root(gtid, root);
4014 
4015   KMP_MB();
4016   KC_TRACE(10,
4017            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4018 
4019   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4020 }
4021 
4022 #if KMP_OS_WINDOWS
4023 /* __kmp_forkjoin_lock must be already held
4024    Unregisters a root thread that is not the current thread.  Returns the number
4025    of __kmp_threads entries freed as a result. */
4026 static int __kmp_unregister_root_other_thread(int gtid) {
4027   kmp_root_t *root = __kmp_root[gtid];
4028   int r;
4029 
4030   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4031   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4032   KMP_ASSERT(KMP_UBER_GTID(gtid));
4033   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4034   KMP_ASSERT(root->r.r_active == FALSE);
4035 
4036   r = __kmp_reset_root(gtid, root);
4037   KC_TRACE(10,
4038            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4039   return r;
4040 }
4041 #endif
4042 
4043 #if KMP_DEBUG
4044 void __kmp_task_info() {
4045 
4046   kmp_int32 gtid = __kmp_entry_gtid();
4047   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4048   kmp_info_t *this_thr = __kmp_threads[gtid];
4049   kmp_team_t *steam = this_thr->th.th_serial_team;
4050   kmp_team_t *team = this_thr->th.th_team;
4051 
4052   __kmp_printf(
4053       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4054       "ptask=%p\n",
4055       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4056       team->t.t_implicit_task_taskdata[tid].td_parent);
4057 }
4058 #endif // KMP_DEBUG
4059 
4060 /* TODO optimize with one big memclr, take out what isn't needed, split
4061    responsibility to workers as much as possible, and delay initialization of
4062    features as much as possible  */
4063 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4064                                   int tid, int gtid) {
4065   /* this_thr->th.th_info.ds.ds_gtid is setup in
4066      kmp_allocate_thread/create_worker.
4067      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4068   kmp_info_t *master = team->t.t_threads[0];
4069   KMP_DEBUG_ASSERT(this_thr != NULL);
4070   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4071   KMP_DEBUG_ASSERT(team);
4072   KMP_DEBUG_ASSERT(team->t.t_threads);
4073   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4074   KMP_DEBUG_ASSERT(master);
4075   KMP_DEBUG_ASSERT(master->th.th_root);
4076 
4077   KMP_MB();
4078 
4079   TCW_SYNC_PTR(this_thr->th.th_team, team);
4080 
4081   this_thr->th.th_info.ds.ds_tid = tid;
4082   this_thr->th.th_set_nproc = 0;
4083   if (__kmp_tasking_mode != tskm_immediate_exec)
4084     // When tasking is possible, threads are not safe to reap until they are
4085     // done tasking; this will be set when tasking code is exited in wait
4086     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4087   else // no tasking --> always safe to reap
4088     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4089   this_thr->th.th_set_proc_bind = proc_bind_default;
4090 #if KMP_AFFINITY_SUPPORTED
4091   this_thr->th.th_new_place = this_thr->th.th_current_place;
4092 #endif
4093   this_thr->th.th_root = master->th.th_root;
4094 
4095   /* setup the thread's cache of the team structure */
4096   this_thr->th.th_team_nproc = team->t.t_nproc;
4097   this_thr->th.th_team_master = master;
4098   this_thr->th.th_team_serialized = team->t.t_serialized;
4099   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4100 
4101   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4102 
4103   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4104                 tid, gtid, this_thr, this_thr->th.th_current_task));
4105 
4106   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4107                            team, tid, TRUE);
4108 
4109   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4110                 tid, gtid, this_thr, this_thr->th.th_current_task));
4111   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4112   // __kmp_initialize_team()?
4113 
4114   /* TODO no worksharing in speculative threads */
4115   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4116 
4117   this_thr->th.th_local.this_construct = 0;
4118 
4119   if (!this_thr->th.th_pri_common) {
4120     this_thr->th.th_pri_common =
4121         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4122     if (__kmp_storage_map) {
4123       __kmp_print_storage_map_gtid(
4124           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4125           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4126     }
4127     this_thr->th.th_pri_head = NULL;
4128   }
4129 
4130   if (this_thr != master && // Master's CG root is initialized elsewhere
4131       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4132     // Make new thread's CG root same as master's
4133     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4134     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4135     if (tmp) {
4136       // worker changes CG, need to check if old CG should be freed
4137       int i = tmp->cg_nthreads--;
4138       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4139                      " on node %p of thread %p to %d\n",
4140                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4141       if (i == 1) {
4142         __kmp_free(tmp); // last thread left CG --> free it
4143       }
4144     }
4145     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4146     // Increment new thread's CG root's counter to add the new thread
4147     this_thr->th.th_cg_roots->cg_nthreads++;
4148     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4149                    " node %p of thread %p to %d\n",
4150                    this_thr, this_thr->th.th_cg_roots,
4151                    this_thr->th.th_cg_roots->cg_root,
4152                    this_thr->th.th_cg_roots->cg_nthreads));
4153     this_thr->th.th_current_task->td_icvs.thread_limit =
4154         this_thr->th.th_cg_roots->cg_thread_limit;
4155   }
4156 
4157   /* Initialize dynamic dispatch */
4158   {
4159     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4160     // Use team max_nproc since this will never change for the team.
4161     size_t disp_size =
4162         sizeof(dispatch_private_info_t) *
4163         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4164     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4165                   team->t.t_max_nproc));
4166     KMP_ASSERT(dispatch);
4167     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4168     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4169 
4170     dispatch->th_disp_index = 0;
4171     dispatch->th_doacross_buf_idx = 0;
4172     if (!dispatch->th_disp_buffer) {
4173       dispatch->th_disp_buffer =
4174           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4175 
4176       if (__kmp_storage_map) {
4177         __kmp_print_storage_map_gtid(
4178             gtid, &dispatch->th_disp_buffer[0],
4179             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4180                                           ? 1
4181                                           : __kmp_dispatch_num_buffers],
4182             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4183                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4184             gtid, team->t.t_id, gtid);
4185       }
4186     } else {
4187       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4188     }
4189 
4190     dispatch->th_dispatch_pr_current = 0;
4191     dispatch->th_dispatch_sh_current = 0;
4192 
4193     dispatch->th_deo_fcn = 0; /* ORDERED     */
4194     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4195   }
4196 
4197   this_thr->th.th_next_pool = NULL;
4198 
4199   if (!this_thr->th.th_task_state_memo_stack) {
4200     size_t i;
4201     this_thr->th.th_task_state_memo_stack =
4202         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4203     this_thr->th.th_task_state_top = 0;
4204     this_thr->th.th_task_state_stack_sz = 4;
4205     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4206          ++i) // zero init the stack
4207       this_thr->th.th_task_state_memo_stack[i] = 0;
4208   }
4209 
4210   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4211   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4212 
4213   KMP_MB();
4214 }
4215 
4216 /* allocate a new thread for the requesting team. this is only called from
4217    within a forkjoin critical section. we will first try to get an available
4218    thread from the thread pool. if none is available, we will fork a new one
4219    assuming we are able to create a new one. this should be assured, as the
4220    caller should check on this first. */
4221 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4222                                   int new_tid) {
4223   kmp_team_t *serial_team;
4224   kmp_info_t *new_thr;
4225   int new_gtid;
4226 
4227   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4228   KMP_DEBUG_ASSERT(root && team);
4229 #if !KMP_NESTED_HOT_TEAMS
4230   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4231 #endif
4232   KMP_MB();
4233 
4234   /* first, try to get one from the thread pool */
4235   if (__kmp_thread_pool) {
4236     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4237     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4238     if (new_thr == __kmp_thread_pool_insert_pt) {
4239       __kmp_thread_pool_insert_pt = NULL;
4240     }
4241     TCW_4(new_thr->th.th_in_pool, FALSE);
4242     __kmp_suspend_initialize_thread(new_thr);
4243     __kmp_lock_suspend_mx(new_thr);
4244     if (new_thr->th.th_active_in_pool == TRUE) {
4245       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4246       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4247       new_thr->th.th_active_in_pool = FALSE;
4248     }
4249     __kmp_unlock_suspend_mx(new_thr);
4250 
4251     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4252                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4253     KMP_ASSERT(!new_thr->th.th_team);
4254     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4255 
4256     /* setup the thread structure */
4257     __kmp_initialize_info(new_thr, team, new_tid,
4258                           new_thr->th.th_info.ds.ds_gtid);
4259     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4260 
4261     TCW_4(__kmp_nth, __kmp_nth + 1);
4262 
4263     new_thr->th.th_task_state = 0;
4264     new_thr->th.th_task_state_top = 0;
4265     new_thr->th.th_task_state_stack_sz = 4;
4266 
4267 #ifdef KMP_ADJUST_BLOCKTIME
4268     /* Adjust blocktime back to zero if necessary */
4269     /* Middle initialization might not have occurred yet */
4270     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4271       if (__kmp_nth > __kmp_avail_proc) {
4272         __kmp_zero_bt = TRUE;
4273       }
4274     }
4275 #endif /* KMP_ADJUST_BLOCKTIME */
4276 
4277 #if KMP_DEBUG
4278     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4279     // KMP_BARRIER_PARENT_FLAG.
4280     int b;
4281     kmp_balign_t *balign = new_thr->th.th_bar;
4282     for (b = 0; b < bs_last_barrier; ++b)
4283       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4284 #endif
4285 
4286     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4287                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4288 
4289     KMP_MB();
4290     return new_thr;
4291   }
4292 
4293   /* no, well fork a new one */
4294   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4295   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4296 
4297 #if KMP_USE_MONITOR
4298   // If this is the first worker thread the RTL is creating, then also
4299   // launch the monitor thread.  We try to do this as early as possible.
4300   if (!TCR_4(__kmp_init_monitor)) {
4301     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4302     if (!TCR_4(__kmp_init_monitor)) {
4303       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4304       TCW_4(__kmp_init_monitor, 1);
4305       __kmp_create_monitor(&__kmp_monitor);
4306       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4307 #if KMP_OS_WINDOWS
4308       // AC: wait until monitor has started. This is a fix for CQ232808.
4309       // The reason is that if the library is loaded/unloaded in a loop with
4310       // small (parallel) work in between, then there is high probability that
4311       // monitor thread started after the library shutdown. At shutdown it is
4312       // too late to cope with the problem, because when the master is in
4313       // DllMain (process detach) the monitor has no chances to start (it is
4314       // blocked), and master has no means to inform the monitor that the
4315       // library has gone, because all the memory which the monitor can access
4316       // is going to be released/reset.
4317       while (TCR_4(__kmp_init_monitor) < 2) {
4318         KMP_YIELD(TRUE);
4319       }
4320       KF_TRACE(10, ("after monitor thread has started\n"));
4321 #endif
4322     }
4323     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4324   }
4325 #endif
4326 
4327   KMP_MB();
4328 
4329   {
4330     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4331                              ? 1
4332                              : __kmp_hidden_helper_threads_num + 1;
4333 
4334     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4335          ++new_gtid) {
4336       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4337     }
4338 
4339     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4340       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4341     }
4342   }
4343 
4344   /* allocate space for it. */
4345   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4346 
4347   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4348 
4349 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4350   // suppress race conditions detection on synchronization flags in debug mode
4351   // this helps to analyze library internals eliminating false positives
4352   __itt_suppress_mark_range(
4353       __itt_suppress_range, __itt_suppress_threading_errors,
4354       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4355   __itt_suppress_mark_range(
4356       __itt_suppress_range, __itt_suppress_threading_errors,
4357       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4358 #if KMP_OS_WINDOWS
4359   __itt_suppress_mark_range(
4360       __itt_suppress_range, __itt_suppress_threading_errors,
4361       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4362 #else
4363   __itt_suppress_mark_range(__itt_suppress_range,
4364                             __itt_suppress_threading_errors,
4365                             &new_thr->th.th_suspend_init_count,
4366                             sizeof(new_thr->th.th_suspend_init_count));
4367 #endif
4368   // TODO: check if we need to also suppress b_arrived flags
4369   __itt_suppress_mark_range(__itt_suppress_range,
4370                             __itt_suppress_threading_errors,
4371                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4372                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4373   __itt_suppress_mark_range(__itt_suppress_range,
4374                             __itt_suppress_threading_errors,
4375                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4376                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4377   __itt_suppress_mark_range(__itt_suppress_range,
4378                             __itt_suppress_threading_errors,
4379                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4380                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4381 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4382   if (__kmp_storage_map) {
4383     __kmp_print_thread_storage_map(new_thr, new_gtid);
4384   }
4385 
4386   // add the reserve serialized team, initialized from the team's master thread
4387   {
4388     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4389     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4390     new_thr->th.th_serial_team = serial_team =
4391         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4392 #if OMPT_SUPPORT
4393                                           ompt_data_none, // root parallel id
4394 #endif
4395                                           proc_bind_default, &r_icvs,
4396                                           0 USE_NESTED_HOT_ARG(NULL));
4397   }
4398   KMP_ASSERT(serial_team);
4399   serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4400   // execution (it is unused for now).
4401   serial_team->t.t_threads[0] = new_thr;
4402   KF_TRACE(10,
4403            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4404             new_thr));
4405 
4406   /* setup the thread structures */
4407   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4408 
4409 #if USE_FAST_MEMORY
4410   __kmp_initialize_fast_memory(new_thr);
4411 #endif /* USE_FAST_MEMORY */
4412 
4413 #if KMP_USE_BGET
4414   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4415   __kmp_initialize_bget(new_thr);
4416 #endif
4417 
4418   __kmp_init_random(new_thr); // Initialize random number generator
4419 
4420   /* Initialize these only once when thread is grabbed for a team allocation */
4421   KA_TRACE(20,
4422            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4423             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4424 
4425   int b;
4426   kmp_balign_t *balign = new_thr->th.th_bar;
4427   for (b = 0; b < bs_last_barrier; ++b) {
4428     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4429     balign[b].bb.team = NULL;
4430     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4431     balign[b].bb.use_oncore_barrier = 0;
4432   }
4433 
4434   new_thr->th.th_spin_here = FALSE;
4435   new_thr->th.th_next_waiting = 0;
4436 #if KMP_OS_UNIX
4437   new_thr->th.th_blocking = false;
4438 #endif
4439 
4440 #if KMP_AFFINITY_SUPPORTED
4441   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4442   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4443   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4444   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4445 #endif
4446   new_thr->th.th_def_allocator = __kmp_def_allocator;
4447   new_thr->th.th_prev_level = 0;
4448   new_thr->th.th_prev_num_threads = 1;
4449 
4450   TCW_4(new_thr->th.th_in_pool, FALSE);
4451   new_thr->th.th_active_in_pool = FALSE;
4452   TCW_4(new_thr->th.th_active, TRUE);
4453 
4454   /* adjust the global counters */
4455   __kmp_all_nth++;
4456   __kmp_nth++;
4457 
4458   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4459   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4460   if (__kmp_adjust_gtid_mode) {
4461     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4462       if (TCR_4(__kmp_gtid_mode) != 2) {
4463         TCW_4(__kmp_gtid_mode, 2);
4464       }
4465     } else {
4466       if (TCR_4(__kmp_gtid_mode) != 1) {
4467         TCW_4(__kmp_gtid_mode, 1);
4468       }
4469     }
4470   }
4471 
4472 #ifdef KMP_ADJUST_BLOCKTIME
4473   /* Adjust blocktime back to zero if necessary       */
4474   /* Middle initialization might not have occurred yet */
4475   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4476     if (__kmp_nth > __kmp_avail_proc) {
4477       __kmp_zero_bt = TRUE;
4478     }
4479   }
4480 #endif /* KMP_ADJUST_BLOCKTIME */
4481 
4482   /* actually fork it and create the new worker thread */
4483   KF_TRACE(
4484       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4485   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4486   KF_TRACE(10,
4487            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4488 
4489   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4490                 new_gtid));
4491   KMP_MB();
4492   return new_thr;
4493 }
4494 
4495 /* Reinitialize team for reuse.
4496    The hot team code calls this case at every fork barrier, so EPCC barrier
4497    test are extremely sensitive to changes in it, esp. writes to the team
4498    struct, which cause a cache invalidation in all threads.
4499    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4500 static void __kmp_reinitialize_team(kmp_team_t *team,
4501                                     kmp_internal_control_t *new_icvs,
4502                                     ident_t *loc) {
4503   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4504                 team->t.t_threads[0], team));
4505   KMP_DEBUG_ASSERT(team && new_icvs);
4506   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4507   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4508 
4509   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4510   // Copy ICVs to the master thread's implicit taskdata
4511   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4512   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4513 
4514   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4515                 team->t.t_threads[0], team));
4516 }
4517 
4518 /* Initialize the team data structure.
4519    This assumes the t_threads and t_max_nproc are already set.
4520    Also, we don't touch the arguments */
4521 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4522                                   kmp_internal_control_t *new_icvs,
4523                                   ident_t *loc) {
4524   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4525 
4526   /* verify */
4527   KMP_DEBUG_ASSERT(team);
4528   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4529   KMP_DEBUG_ASSERT(team->t.t_threads);
4530   KMP_MB();
4531 
4532   team->t.t_master_tid = 0; /* not needed */
4533   /* team->t.t_master_bar;        not needed */
4534   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4535   team->t.t_nproc = new_nproc;
4536 
4537   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4538   team->t.t_next_pool = NULL;
4539   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4540    * up hot team */
4541 
4542   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4543   team->t.t_invoke = NULL; /* not needed */
4544 
4545   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4546   team->t.t_sched.sched = new_icvs->sched.sched;
4547 
4548 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4549   team->t.t_fp_control_saved = FALSE; /* not needed */
4550   team->t.t_x87_fpu_control_word = 0; /* not needed */
4551   team->t.t_mxcsr = 0; /* not needed */
4552 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4553 
4554   team->t.t_construct = 0;
4555 
4556   team->t.t_ordered.dt.t_value = 0;
4557   team->t.t_master_active = FALSE;
4558 
4559 #ifdef KMP_DEBUG
4560   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4561 #endif
4562 #if KMP_OS_WINDOWS
4563   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4564 #endif
4565 
4566   team->t.t_control_stack_top = NULL;
4567 
4568   __kmp_reinitialize_team(team, new_icvs, loc);
4569 
4570   KMP_MB();
4571   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4572 }
4573 
4574 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4575 /* Sets full mask for thread and returns old mask, no changes to structures. */
4576 static void
4577 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4578   if (KMP_AFFINITY_CAPABLE()) {
4579     int status;
4580     if (old_mask != NULL) {
4581       status = __kmp_get_system_affinity(old_mask, TRUE);
4582       int error = errno;
4583       if (status != 0) {
4584         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4585                     __kmp_msg_null);
4586       }
4587     }
4588     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4589   }
4590 }
4591 #endif
4592 
4593 #if KMP_AFFINITY_SUPPORTED
4594 
4595 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4596 // It calculates the worker + master thread's partition based upon the parent
4597 // thread's partition, and binds each worker to a thread in their partition.
4598 // The master thread's partition should already include its current binding.
4599 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4600   // Copy the master thread's place partition to the team struct
4601   kmp_info_t *master_th = team->t.t_threads[0];
4602   KMP_DEBUG_ASSERT(master_th != NULL);
4603   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4604   int first_place = master_th->th.th_first_place;
4605   int last_place = master_th->th.th_last_place;
4606   int masters_place = master_th->th.th_current_place;
4607   team->t.t_first_place = first_place;
4608   team->t.t_last_place = last_place;
4609 
4610   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4611                 "bound to place %d partition = [%d,%d]\n",
4612                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4613                 team->t.t_id, masters_place, first_place, last_place));
4614 
4615   switch (proc_bind) {
4616 
4617   case proc_bind_default:
4618     // serial teams might have the proc_bind policy set to proc_bind_default. It
4619     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4620     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4621     break;
4622 
4623   case proc_bind_master: {
4624     int f;
4625     int n_th = team->t.t_nproc;
4626     for (f = 1; f < n_th; f++) {
4627       kmp_info_t *th = team->t.t_threads[f];
4628       KMP_DEBUG_ASSERT(th != NULL);
4629       th->th.th_first_place = first_place;
4630       th->th.th_last_place = last_place;
4631       th->th.th_new_place = masters_place;
4632       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4633           team->t.t_display_affinity != 1) {
4634         team->t.t_display_affinity = 1;
4635       }
4636 
4637       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4638                      "partition = [%d,%d]\n",
4639                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4640                      f, masters_place, first_place, last_place));
4641     }
4642   } break;
4643 
4644   case proc_bind_close: {
4645     int f;
4646     int n_th = team->t.t_nproc;
4647     int n_places;
4648     if (first_place <= last_place) {
4649       n_places = last_place - first_place + 1;
4650     } else {
4651       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4652     }
4653     if (n_th <= n_places) {
4654       int place = masters_place;
4655       for (f = 1; f < n_th; f++) {
4656         kmp_info_t *th = team->t.t_threads[f];
4657         KMP_DEBUG_ASSERT(th != NULL);
4658 
4659         if (place == last_place) {
4660           place = first_place;
4661         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4662           place = 0;
4663         } else {
4664           place++;
4665         }
4666         th->th.th_first_place = first_place;
4667         th->th.th_last_place = last_place;
4668         th->th.th_new_place = place;
4669         if (__kmp_display_affinity && place != th->th.th_current_place &&
4670             team->t.t_display_affinity != 1) {
4671           team->t.t_display_affinity = 1;
4672         }
4673 
4674         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4675                        "partition = [%d,%d]\n",
4676                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4677                        team->t.t_id, f, place, first_place, last_place));
4678       }
4679     } else {
4680       int S, rem, gap, s_count;
4681       S = n_th / n_places;
4682       s_count = 0;
4683       rem = n_th - (S * n_places);
4684       gap = rem > 0 ? n_places / rem : n_places;
4685       int place = masters_place;
4686       int gap_ct = gap;
4687       for (f = 0; f < n_th; f++) {
4688         kmp_info_t *th = team->t.t_threads[f];
4689         KMP_DEBUG_ASSERT(th != NULL);
4690 
4691         th->th.th_first_place = first_place;
4692         th->th.th_last_place = last_place;
4693         th->th.th_new_place = place;
4694         if (__kmp_display_affinity && place != th->th.th_current_place &&
4695             team->t.t_display_affinity != 1) {
4696           team->t.t_display_affinity = 1;
4697         }
4698         s_count++;
4699 
4700         if ((s_count == S) && rem && (gap_ct == gap)) {
4701           // do nothing, add an extra thread to place on next iteration
4702         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4703           // we added an extra thread to this place; move to next place
4704           if (place == last_place) {
4705             place = first_place;
4706           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4707             place = 0;
4708           } else {
4709             place++;
4710           }
4711           s_count = 0;
4712           gap_ct = 1;
4713           rem--;
4714         } else if (s_count == S) { // place full; don't add extra
4715           if (place == last_place) {
4716             place = first_place;
4717           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4718             place = 0;
4719           } else {
4720             place++;
4721           }
4722           gap_ct++;
4723           s_count = 0;
4724         }
4725 
4726         KA_TRACE(100,
4727                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4728                   "partition = [%d,%d]\n",
4729                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4730                   th->th.th_new_place, first_place, last_place));
4731       }
4732       KMP_DEBUG_ASSERT(place == masters_place);
4733     }
4734   } break;
4735 
4736   case proc_bind_spread: {
4737     int f;
4738     int n_th = team->t.t_nproc;
4739     int n_places;
4740     int thidx;
4741     if (first_place <= last_place) {
4742       n_places = last_place - first_place + 1;
4743     } else {
4744       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4745     }
4746     if (n_th <= n_places) {
4747       int place = -1;
4748 
4749       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4750         int S = n_places / n_th;
4751         int s_count, rem, gap, gap_ct;
4752 
4753         place = masters_place;
4754         rem = n_places - n_th * S;
4755         gap = rem ? n_th / rem : 1;
4756         gap_ct = gap;
4757         thidx = n_th;
4758         if (update_master_only == 1)
4759           thidx = 1;
4760         for (f = 0; f < thidx; f++) {
4761           kmp_info_t *th = team->t.t_threads[f];
4762           KMP_DEBUG_ASSERT(th != NULL);
4763 
4764           th->th.th_first_place = place;
4765           th->th.th_new_place = place;
4766           if (__kmp_display_affinity && place != th->th.th_current_place &&
4767               team->t.t_display_affinity != 1) {
4768             team->t.t_display_affinity = 1;
4769           }
4770           s_count = 1;
4771           while (s_count < S) {
4772             if (place == last_place) {
4773               place = first_place;
4774             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4775               place = 0;
4776             } else {
4777               place++;
4778             }
4779             s_count++;
4780           }
4781           if (rem && (gap_ct == gap)) {
4782             if (place == last_place) {
4783               place = first_place;
4784             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4785               place = 0;
4786             } else {
4787               place++;
4788             }
4789             rem--;
4790             gap_ct = 0;
4791           }
4792           th->th.th_last_place = place;
4793           gap_ct++;
4794 
4795           if (place == last_place) {
4796             place = first_place;
4797           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798             place = 0;
4799           } else {
4800             place++;
4801           }
4802 
4803           KA_TRACE(100,
4804                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4805                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4806                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4807                     f, th->th.th_new_place, th->th.th_first_place,
4808                     th->th.th_last_place, __kmp_affinity_num_masks));
4809         }
4810       } else {
4811         /* Having uniform space of available computation places I can create
4812            T partitions of round(P/T) size and put threads into the first
4813            place of each partition. */
4814         double current = static_cast<double>(masters_place);
4815         double spacing =
4816             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4817         int first, last;
4818         kmp_info_t *th;
4819 
4820         thidx = n_th + 1;
4821         if (update_master_only == 1)
4822           thidx = 1;
4823         for (f = 0; f < thidx; f++) {
4824           first = static_cast<int>(current);
4825           last = static_cast<int>(current + spacing) - 1;
4826           KMP_DEBUG_ASSERT(last >= first);
4827           if (first >= n_places) {
4828             if (masters_place) {
4829               first -= n_places;
4830               last -= n_places;
4831               if (first == (masters_place + 1)) {
4832                 KMP_DEBUG_ASSERT(f == n_th);
4833                 first--;
4834               }
4835               if (last == masters_place) {
4836                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4837                 last--;
4838               }
4839             } else {
4840               KMP_DEBUG_ASSERT(f == n_th);
4841               first = 0;
4842               last = 0;
4843             }
4844           }
4845           if (last >= n_places) {
4846             last = (n_places - 1);
4847           }
4848           place = first;
4849           current += spacing;
4850           if (f < n_th) {
4851             KMP_DEBUG_ASSERT(0 <= first);
4852             KMP_DEBUG_ASSERT(n_places > first);
4853             KMP_DEBUG_ASSERT(0 <= last);
4854             KMP_DEBUG_ASSERT(n_places > last);
4855             KMP_DEBUG_ASSERT(last_place >= first_place);
4856             th = team->t.t_threads[f];
4857             KMP_DEBUG_ASSERT(th);
4858             th->th.th_first_place = first;
4859             th->th.th_new_place = place;
4860             th->th.th_last_place = last;
4861             if (__kmp_display_affinity && place != th->th.th_current_place &&
4862                 team->t.t_display_affinity != 1) {
4863               team->t.t_display_affinity = 1;
4864             }
4865             KA_TRACE(100,
4866                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4867                       "partition = [%d,%d], spacing = %.4f\n",
4868                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4869                       team->t.t_id, f, th->th.th_new_place,
4870                       th->th.th_first_place, th->th.th_last_place, spacing));
4871           }
4872         }
4873       }
4874       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4875     } else {
4876       int S, rem, gap, s_count;
4877       S = n_th / n_places;
4878       s_count = 0;
4879       rem = n_th - (S * n_places);
4880       gap = rem > 0 ? n_places / rem : n_places;
4881       int place = masters_place;
4882       int gap_ct = gap;
4883       thidx = n_th;
4884       if (update_master_only == 1)
4885         thidx = 1;
4886       for (f = 0; f < thidx; f++) {
4887         kmp_info_t *th = team->t.t_threads[f];
4888         KMP_DEBUG_ASSERT(th != NULL);
4889 
4890         th->th.th_first_place = place;
4891         th->th.th_last_place = place;
4892         th->th.th_new_place = place;
4893         if (__kmp_display_affinity && place != th->th.th_current_place &&
4894             team->t.t_display_affinity != 1) {
4895           team->t.t_display_affinity = 1;
4896         }
4897         s_count++;
4898 
4899         if ((s_count == S) && rem && (gap_ct == gap)) {
4900           // do nothing, add an extra thread to place on next iteration
4901         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4902           // we added an extra thread to this place; move on to next place
4903           if (place == last_place) {
4904             place = first_place;
4905           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4906             place = 0;
4907           } else {
4908             place++;
4909           }
4910           s_count = 0;
4911           gap_ct = 1;
4912           rem--;
4913         } else if (s_count == S) { // place is full; don't add extra thread
4914           if (place == last_place) {
4915             place = first_place;
4916           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4917             place = 0;
4918           } else {
4919             place++;
4920           }
4921           gap_ct++;
4922           s_count = 0;
4923         }
4924 
4925         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4926                        "partition = [%d,%d]\n",
4927                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4928                        team->t.t_id, f, th->th.th_new_place,
4929                        th->th.th_first_place, th->th.th_last_place));
4930       }
4931       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4932     }
4933   } break;
4934 
4935   default:
4936     break;
4937   }
4938 
4939   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4940 }
4941 
4942 #endif // KMP_AFFINITY_SUPPORTED
4943 
4944 /* allocate a new team data structure to use.  take one off of the free pool if
4945    available */
4946 kmp_team_t *
4947 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4948 #if OMPT_SUPPORT
4949                     ompt_data_t ompt_parallel_data,
4950 #endif
4951                     kmp_proc_bind_t new_proc_bind,
4952                     kmp_internal_control_t *new_icvs,
4953                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4954   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4955   int f;
4956   kmp_team_t *team;
4957   int use_hot_team = !root->r.r_active;
4958   int level = 0;
4959 
4960   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4961   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4962   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4963   KMP_MB();
4964 
4965 #if KMP_NESTED_HOT_TEAMS
4966   kmp_hot_team_ptr_t *hot_teams;
4967   if (master) {
4968     team = master->th.th_team;
4969     level = team->t.t_active_level;
4970     if (master->th.th_teams_microtask) { // in teams construct?
4971       if (master->th.th_teams_size.nteams > 1 &&
4972           ( // #teams > 1
4973               team->t.t_pkfn ==
4974                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4975               master->th.th_teams_level <
4976                   team->t.t_level)) { // or nested parallel inside the teams
4977         ++level; // not increment if #teams==1, or for outer fork of the teams;
4978         // increment otherwise
4979       }
4980     }
4981     hot_teams = master->th.th_hot_teams;
4982     if (level < __kmp_hot_teams_max_level && hot_teams &&
4983         hot_teams[level].hot_team) {
4984       // hot team has already been allocated for given level
4985       use_hot_team = 1;
4986     } else {
4987       use_hot_team = 0;
4988     }
4989   } else {
4990     // check we won't access uninitialized hot_teams, just in case
4991     KMP_DEBUG_ASSERT(new_nproc == 1);
4992   }
4993 #endif
4994   // Optimization to use a "hot" team
4995   if (use_hot_team && new_nproc > 1) {
4996     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4997 #if KMP_NESTED_HOT_TEAMS
4998     team = hot_teams[level].hot_team;
4999 #else
5000     team = root->r.r_hot_team;
5001 #endif
5002 #if KMP_DEBUG
5003     if (__kmp_tasking_mode != tskm_immediate_exec) {
5004       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5005                     "task_team[1] = %p before reinit\n",
5006                     team->t.t_task_team[0], team->t.t_task_team[1]));
5007     }
5008 #endif
5009 
5010     // Has the number of threads changed?
5011     /* Let's assume the most common case is that the number of threads is
5012        unchanged, and put that case first. */
5013     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5014       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5015       // This case can mean that omp_set_num_threads() was called and the hot
5016       // team size was already reduced, so we check the special flag
5017       if (team->t.t_size_changed == -1) {
5018         team->t.t_size_changed = 1;
5019       } else {
5020         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5021       }
5022 
5023       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5024       kmp_r_sched_t new_sched = new_icvs->sched;
5025       // set master's schedule as new run-time schedule
5026       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5027 
5028       __kmp_reinitialize_team(team, new_icvs,
5029                               root->r.r_uber_thread->th.th_ident);
5030 
5031       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5032                     team->t.t_threads[0], team));
5033       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5034 
5035 #if KMP_AFFINITY_SUPPORTED
5036       if ((team->t.t_size_changed == 0) &&
5037           (team->t.t_proc_bind == new_proc_bind)) {
5038         if (new_proc_bind == proc_bind_spread) {
5039           __kmp_partition_places(
5040               team, 1); // add flag to update only master for spread
5041         }
5042         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5043                        "proc_bind = %d, partition = [%d,%d]\n",
5044                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5045                        team->t.t_last_place));
5046       } else {
5047         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5048         __kmp_partition_places(team);
5049       }
5050 #else
5051       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5052 #endif /* KMP_AFFINITY_SUPPORTED */
5053     } else if (team->t.t_nproc > new_nproc) {
5054       KA_TRACE(20,
5055                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5056                 new_nproc));
5057 
5058       team->t.t_size_changed = 1;
5059 #if KMP_NESTED_HOT_TEAMS
5060       if (__kmp_hot_teams_mode == 0) {
5061         // AC: saved number of threads should correspond to team's value in this
5062         // mode, can be bigger in mode 1, when hot team has threads in reserve
5063         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5064         hot_teams[level].hot_team_nth = new_nproc;
5065 #endif // KMP_NESTED_HOT_TEAMS
5066         /* release the extra threads we don't need any more */
5067         for (f = new_nproc; f < team->t.t_nproc; f++) {
5068           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5069           if (__kmp_tasking_mode != tskm_immediate_exec) {
5070             // When decreasing team size, threads no longer in the team should
5071             // unref task team.
5072             team->t.t_threads[f]->th.th_task_team = NULL;
5073           }
5074           __kmp_free_thread(team->t.t_threads[f]);
5075           team->t.t_threads[f] = NULL;
5076         }
5077 #if KMP_NESTED_HOT_TEAMS
5078       } // (__kmp_hot_teams_mode == 0)
5079       else {
5080         // When keeping extra threads in team, switch threads to wait on own
5081         // b_go flag
5082         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5083           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5084           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5085           for (int b = 0; b < bs_last_barrier; ++b) {
5086             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5087               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5088             }
5089             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5090           }
5091         }
5092       }
5093 #endif // KMP_NESTED_HOT_TEAMS
5094       team->t.t_nproc = new_nproc;
5095       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5096       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5097       __kmp_reinitialize_team(team, new_icvs,
5098                               root->r.r_uber_thread->th.th_ident);
5099 
5100       // Update remaining threads
5101       for (f = 0; f < new_nproc; ++f) {
5102         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5103       }
5104 
5105       // restore the current task state of the master thread: should be the
5106       // implicit task
5107       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5108                     team->t.t_threads[0], team));
5109 
5110       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5111 
5112 #ifdef KMP_DEBUG
5113       for (f = 0; f < team->t.t_nproc; f++) {
5114         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5115                          team->t.t_threads[f]->th.th_team_nproc ==
5116                              team->t.t_nproc);
5117       }
5118 #endif
5119 
5120       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5121 #if KMP_AFFINITY_SUPPORTED
5122       __kmp_partition_places(team);
5123 #endif
5124     } else { // team->t.t_nproc < new_nproc
5125 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5126       kmp_affin_mask_t *old_mask;
5127       if (KMP_AFFINITY_CAPABLE()) {
5128         KMP_CPU_ALLOC(old_mask);
5129       }
5130 #endif
5131 
5132       KA_TRACE(20,
5133                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5134                 new_nproc));
5135 
5136       team->t.t_size_changed = 1;
5137 
5138 #if KMP_NESTED_HOT_TEAMS
5139       int avail_threads = hot_teams[level].hot_team_nth;
5140       if (new_nproc < avail_threads)
5141         avail_threads = new_nproc;
5142       kmp_info_t **other_threads = team->t.t_threads;
5143       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5144         // Adjust barrier data of reserved threads (if any) of the team
5145         // Other data will be set in __kmp_initialize_info() below.
5146         int b;
5147         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5148         for (b = 0; b < bs_last_barrier; ++b) {
5149           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5150           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5151 #if USE_DEBUGGER
5152           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5153 #endif
5154         }
5155       }
5156       if (hot_teams[level].hot_team_nth >= new_nproc) {
5157         // we have all needed threads in reserve, no need to allocate any
5158         // this only possible in mode 1, cannot have reserved threads in mode 0
5159         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5160         team->t.t_nproc = new_nproc; // just get reserved threads involved
5161       } else {
5162         // we may have some threads in reserve, but not enough
5163         team->t.t_nproc =
5164             hot_teams[level]
5165                 .hot_team_nth; // get reserved threads involved if any
5166         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5167 #endif // KMP_NESTED_HOT_TEAMS
5168         if (team->t.t_max_nproc < new_nproc) {
5169           /* reallocate larger arrays */
5170           __kmp_reallocate_team_arrays(team, new_nproc);
5171           __kmp_reinitialize_team(team, new_icvs, NULL);
5172         }
5173 
5174 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5175         /* Temporarily set full mask for master thread before creation of
5176            workers. The reason is that workers inherit the affinity from master,
5177            so if a lot of workers are created on the single core quickly, they
5178            don't get a chance to set their own affinity for a long time. */
5179         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5180 #endif
5181 
5182         /* allocate new threads for the hot team */
5183         for (f = team->t.t_nproc; f < new_nproc; f++) {
5184           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5185           KMP_DEBUG_ASSERT(new_worker);
5186           team->t.t_threads[f] = new_worker;
5187 
5188           KA_TRACE(20,
5189                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5190                     "join=%llu, plain=%llu\n",
5191                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5192                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5193                     team->t.t_bar[bs_plain_barrier].b_arrived));
5194 
5195           { // Initialize barrier data for new threads.
5196             int b;
5197             kmp_balign_t *balign = new_worker->th.th_bar;
5198             for (b = 0; b < bs_last_barrier; ++b) {
5199               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5200               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5201                                KMP_BARRIER_PARENT_FLAG);
5202 #if USE_DEBUGGER
5203               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5204 #endif
5205             }
5206           }
5207         }
5208 
5209 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5210         if (KMP_AFFINITY_CAPABLE()) {
5211           /* Restore initial master thread's affinity mask */
5212           __kmp_set_system_affinity(old_mask, TRUE);
5213           KMP_CPU_FREE(old_mask);
5214         }
5215 #endif
5216 #if KMP_NESTED_HOT_TEAMS
5217       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5218 #endif // KMP_NESTED_HOT_TEAMS
5219       /* make sure everyone is syncronized */
5220       int old_nproc = team->t.t_nproc; // save old value and use to update only
5221       // new threads below
5222       __kmp_initialize_team(team, new_nproc, new_icvs,
5223                             root->r.r_uber_thread->th.th_ident);
5224 
5225       /* reinitialize the threads */
5226       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5227       for (f = 0; f < team->t.t_nproc; ++f)
5228         __kmp_initialize_info(team->t.t_threads[f], team, f,
5229                               __kmp_gtid_from_tid(f, team));
5230 
5231       if (level) { // set th_task_state for new threads in nested hot team
5232         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5233         // only need to set the th_task_state for the new threads. th_task_state
5234         // for master thread will not be accurate until after this in
5235         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5236         // correct value.
5237         for (f = old_nproc; f < team->t.t_nproc; ++f)
5238           team->t.t_threads[f]->th.th_task_state =
5239               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5240       } else { // set th_task_state for new threads in non-nested hot team
5241         kmp_uint8 old_state =
5242             team->t.t_threads[0]->th.th_task_state; // copy master's state
5243         for (f = old_nproc; f < team->t.t_nproc; ++f)
5244           team->t.t_threads[f]->th.th_task_state = old_state;
5245       }
5246 
5247 #ifdef KMP_DEBUG
5248       for (f = 0; f < team->t.t_nproc; ++f) {
5249         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5250                          team->t.t_threads[f]->th.th_team_nproc ==
5251                              team->t.t_nproc);
5252       }
5253 #endif
5254 
5255       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5256 #if KMP_AFFINITY_SUPPORTED
5257       __kmp_partition_places(team);
5258 #endif
5259     } // Check changes in number of threads
5260 
5261     kmp_info_t *master = team->t.t_threads[0];
5262     if (master->th.th_teams_microtask) {
5263       for (f = 1; f < new_nproc; ++f) {
5264         // propagate teams construct specific info to workers
5265         kmp_info_t *thr = team->t.t_threads[f];
5266         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5267         thr->th.th_teams_level = master->th.th_teams_level;
5268         thr->th.th_teams_size = master->th.th_teams_size;
5269       }
5270     }
5271 #if KMP_NESTED_HOT_TEAMS
5272     if (level) {
5273       // Sync barrier state for nested hot teams, not needed for outermost hot
5274       // team.
5275       for (f = 1; f < new_nproc; ++f) {
5276         kmp_info_t *thr = team->t.t_threads[f];
5277         int b;
5278         kmp_balign_t *balign = thr->th.th_bar;
5279         for (b = 0; b < bs_last_barrier; ++b) {
5280           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5281           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5282 #if USE_DEBUGGER
5283           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5284 #endif
5285         }
5286       }
5287     }
5288 #endif // KMP_NESTED_HOT_TEAMS
5289 
5290     /* reallocate space for arguments if necessary */
5291     __kmp_alloc_argv_entries(argc, team, TRUE);
5292     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5293     // The hot team re-uses the previous task team,
5294     // if untouched during the previous release->gather phase.
5295 
5296     KF_TRACE(10, (" hot_team = %p\n", team));
5297 
5298 #if KMP_DEBUG
5299     if (__kmp_tasking_mode != tskm_immediate_exec) {
5300       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5301                     "task_team[1] = %p after reinit\n",
5302                     team->t.t_task_team[0], team->t.t_task_team[1]));
5303     }
5304 #endif
5305 
5306 #if OMPT_SUPPORT
5307     __ompt_team_assign_id(team, ompt_parallel_data);
5308 #endif
5309 
5310     KMP_MB();
5311 
5312     return team;
5313   }
5314 
5315   /* next, let's try to take one from the team pool */
5316   KMP_MB();
5317   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5318     /* TODO: consider resizing undersized teams instead of reaping them, now
5319        that we have a resizing mechanism */
5320     if (team->t.t_max_nproc >= max_nproc) {
5321       /* take this team from the team pool */
5322       __kmp_team_pool = team->t.t_next_pool;
5323 
5324       /* setup the team for fresh use */
5325       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5326 
5327       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5328                     "task_team[1] %p to NULL\n",
5329                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5330       team->t.t_task_team[0] = NULL;
5331       team->t.t_task_team[1] = NULL;
5332 
5333       /* reallocate space for arguments if necessary */
5334       __kmp_alloc_argv_entries(argc, team, TRUE);
5335       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5336 
5337       KA_TRACE(
5338           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5339                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5340       { // Initialize barrier data.
5341         int b;
5342         for (b = 0; b < bs_last_barrier; ++b) {
5343           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5344 #if USE_DEBUGGER
5345           team->t.t_bar[b].b_master_arrived = 0;
5346           team->t.t_bar[b].b_team_arrived = 0;
5347 #endif
5348         }
5349       }
5350 
5351       team->t.t_proc_bind = new_proc_bind;
5352 
5353       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5354                     team->t.t_id));
5355 
5356 #if OMPT_SUPPORT
5357       __ompt_team_assign_id(team, ompt_parallel_data);
5358 #endif
5359 
5360       KMP_MB();
5361 
5362       return team;
5363     }
5364 
5365     /* reap team if it is too small, then loop back and check the next one */
5366     // not sure if this is wise, but, will be redone during the hot-teams
5367     // rewrite.
5368     /* TODO: Use technique to find the right size hot-team, don't reap them */
5369     team = __kmp_reap_team(team);
5370     __kmp_team_pool = team;
5371   }
5372 
5373   /* nothing available in the pool, no matter, make a new team! */
5374   KMP_MB();
5375   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5376 
5377   /* and set it up */
5378   team->t.t_max_nproc = max_nproc;
5379   /* NOTE well, for some reason allocating one big buffer and dividing it up
5380      seems to really hurt performance a lot on the P4, so, let's not use this */
5381   __kmp_allocate_team_arrays(team, max_nproc);
5382 
5383   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5384   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5385 
5386   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5387                 "%p to NULL\n",
5388                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5389   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5390   // memory, no need to duplicate
5391   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5392   // memory, no need to duplicate
5393 
5394   if (__kmp_storage_map) {
5395     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5396   }
5397 
5398   /* allocate space for arguments */
5399   __kmp_alloc_argv_entries(argc, team, FALSE);
5400   team->t.t_argc = argc;
5401 
5402   KA_TRACE(20,
5403            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5404             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5405   { // Initialize barrier data.
5406     int b;
5407     for (b = 0; b < bs_last_barrier; ++b) {
5408       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5409 #if USE_DEBUGGER
5410       team->t.t_bar[b].b_master_arrived = 0;
5411       team->t.t_bar[b].b_team_arrived = 0;
5412 #endif
5413     }
5414   }
5415 
5416   team->t.t_proc_bind = new_proc_bind;
5417 
5418 #if OMPT_SUPPORT
5419   __ompt_team_assign_id(team, ompt_parallel_data);
5420   team->t.ompt_serialized_team_info = NULL;
5421 #endif
5422 
5423   KMP_MB();
5424 
5425   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5426                 team->t.t_id));
5427 
5428   return team;
5429 }
5430 
5431 /* TODO implement hot-teams at all levels */
5432 /* TODO implement lazy thread release on demand (disband request) */
5433 
5434 /* free the team.  return it to the team pool.  release all the threads
5435  * associated with it */
5436 void __kmp_free_team(kmp_root_t *root,
5437                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5438   int f;
5439   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5440                 team->t.t_id));
5441 
5442   /* verify state */
5443   KMP_DEBUG_ASSERT(root);
5444   KMP_DEBUG_ASSERT(team);
5445   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5446   KMP_DEBUG_ASSERT(team->t.t_threads);
5447 
5448   int use_hot_team = team == root->r.r_hot_team;
5449 #if KMP_NESTED_HOT_TEAMS
5450   int level;
5451   kmp_hot_team_ptr_t *hot_teams;
5452   if (master) {
5453     level = team->t.t_active_level - 1;
5454     if (master->th.th_teams_microtask) { // in teams construct?
5455       if (master->th.th_teams_size.nteams > 1) {
5456         ++level; // level was not increased in teams construct for
5457         // team_of_masters
5458       }
5459       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5460           master->th.th_teams_level == team->t.t_level) {
5461         ++level; // level was not increased in teams construct for
5462         // team_of_workers before the parallel
5463       } // team->t.t_level will be increased inside parallel
5464     }
5465     hot_teams = master->th.th_hot_teams;
5466     if (level < __kmp_hot_teams_max_level) {
5467       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5468       use_hot_team = 1;
5469     }
5470   }
5471 #endif // KMP_NESTED_HOT_TEAMS
5472 
5473   /* team is done working */
5474   TCW_SYNC_PTR(team->t.t_pkfn,
5475                NULL); // Important for Debugging Support Library.
5476 #if KMP_OS_WINDOWS
5477   team->t.t_copyin_counter = 0; // init counter for possible reuse
5478 #endif
5479   // Do not reset pointer to parent team to NULL for hot teams.
5480 
5481   /* if we are non-hot team, release our threads */
5482   if (!use_hot_team) {
5483     if (__kmp_tasking_mode != tskm_immediate_exec) {
5484       // Wait for threads to reach reapable state
5485       for (f = 1; f < team->t.t_nproc; ++f) {
5486         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5487         kmp_info_t *th = team->t.t_threads[f];
5488         volatile kmp_uint32 *state = &th->th.th_reap_state;
5489         while (*state != KMP_SAFE_TO_REAP) {
5490 #if KMP_OS_WINDOWS
5491           // On Windows a thread can be killed at any time, check this
5492           DWORD ecode;
5493           if (!__kmp_is_thread_alive(th, &ecode)) {
5494             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5495             break;
5496           }
5497 #endif
5498           // first check if thread is sleeping
5499           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5500           if (fl.is_sleeping())
5501             fl.resume(__kmp_gtid_from_thread(th));
5502           KMP_CPU_PAUSE();
5503         }
5504       }
5505 
5506       // Delete task teams
5507       int tt_idx;
5508       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5509         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5510         if (task_team != NULL) {
5511           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5512             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5513             team->t.t_threads[f]->th.th_task_team = NULL;
5514           }
5515           KA_TRACE(
5516               20,
5517               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5518                __kmp_get_gtid(), task_team, team->t.t_id));
5519 #if KMP_NESTED_HOT_TEAMS
5520           __kmp_free_task_team(master, task_team);
5521 #endif
5522           team->t.t_task_team[tt_idx] = NULL;
5523         }
5524       }
5525     }
5526 
5527     // Reset pointer to parent team only for non-hot teams.
5528     team->t.t_parent = NULL;
5529     team->t.t_level = 0;
5530     team->t.t_active_level = 0;
5531 
5532     /* free the worker threads */
5533     for (f = 1; f < team->t.t_nproc; ++f) {
5534       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5535       __kmp_free_thread(team->t.t_threads[f]);
5536       team->t.t_threads[f] = NULL;
5537     }
5538 
5539     /* put the team back in the team pool */
5540     /* TODO limit size of team pool, call reap_team if pool too large */
5541     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5542     __kmp_team_pool = (volatile kmp_team_t *)team;
5543   } else { // Check if team was created for the masters in a teams construct
5544     // See if first worker is a CG root
5545     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5546                      team->t.t_threads[1]->th.th_cg_roots);
5547     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5548       // Clean up the CG root nodes on workers so that this team can be re-used
5549       for (f = 1; f < team->t.t_nproc; ++f) {
5550         kmp_info_t *thr = team->t.t_threads[f];
5551         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5552                          thr->th.th_cg_roots->cg_root == thr);
5553         // Pop current CG root off list
5554         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5555         thr->th.th_cg_roots = tmp->up;
5556         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5557                        " up to node %p. cg_nthreads was %d\n",
5558                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5559         int i = tmp->cg_nthreads--;
5560         if (i == 1) {
5561           __kmp_free(tmp); // free CG if we are the last thread in it
5562         }
5563         // Restore current task's thread_limit from CG root
5564         if (thr->th.th_cg_roots)
5565           thr->th.th_current_task->td_icvs.thread_limit =
5566               thr->th.th_cg_roots->cg_thread_limit;
5567       }
5568     }
5569   }
5570 
5571   KMP_MB();
5572 }
5573 
5574 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5575 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5576   kmp_team_t *next_pool = team->t.t_next_pool;
5577 
5578   KMP_DEBUG_ASSERT(team);
5579   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5580   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5581   KMP_DEBUG_ASSERT(team->t.t_threads);
5582   KMP_DEBUG_ASSERT(team->t.t_argv);
5583 
5584   /* TODO clean the threads that are a part of this? */
5585 
5586   /* free stuff */
5587   __kmp_free_team_arrays(team);
5588   if (team->t.t_argv != &team->t.t_inline_argv[0])
5589     __kmp_free((void *)team->t.t_argv);
5590   __kmp_free(team);
5591 
5592   KMP_MB();
5593   return next_pool;
5594 }
5595 
5596 // Free the thread.  Don't reap it, just place it on the pool of available
5597 // threads.
5598 //
5599 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5600 // binding for the affinity mechanism to be useful.
5601 //
5602 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5603 // However, we want to avoid a potential performance problem by always
5604 // scanning through the list to find the correct point at which to insert
5605 // the thread (potential N**2 behavior).  To do this we keep track of the
5606 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5607 // With single-level parallelism, threads will always be added to the tail
5608 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5609 // parallelism, all bets are off and we may need to scan through the entire
5610 // free list.
5611 //
5612 // This change also has a potentially large performance benefit, for some
5613 // applications.  Previously, as threads were freed from the hot team, they
5614 // would be placed back on the free list in inverse order.  If the hot team
5615 // grew back to it's original size, then the freed thread would be placed
5616 // back on the hot team in reverse order.  This could cause bad cache
5617 // locality problems on programs where the size of the hot team regularly
5618 // grew and shrunk.
5619 //
5620 // Now, for single-level parallelism, the OMP tid is always == gtid.
5621 void __kmp_free_thread(kmp_info_t *this_th) {
5622   int gtid;
5623   kmp_info_t **scan;
5624 
5625   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5626                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5627 
5628   KMP_DEBUG_ASSERT(this_th);
5629 
5630   // When moving thread to pool, switch thread to wait on own b_go flag, and
5631   // uninitialized (NULL team).
5632   int b;
5633   kmp_balign_t *balign = this_th->th.th_bar;
5634   for (b = 0; b < bs_last_barrier; ++b) {
5635     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5636       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5637     balign[b].bb.team = NULL;
5638     balign[b].bb.leaf_kids = 0;
5639   }
5640   this_th->th.th_task_state = 0;
5641   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5642 
5643   /* put thread back on the free pool */
5644   TCW_PTR(this_th->th.th_team, NULL);
5645   TCW_PTR(this_th->th.th_root, NULL);
5646   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5647 
5648   while (this_th->th.th_cg_roots) {
5649     this_th->th.th_cg_roots->cg_nthreads--;
5650     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5651                    " %p of thread  %p to %d\n",
5652                    this_th, this_th->th.th_cg_roots,
5653                    this_th->th.th_cg_roots->cg_root,
5654                    this_th->th.th_cg_roots->cg_nthreads));
5655     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5656     if (tmp->cg_root == this_th) { // Thread is a cg_root
5657       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5658       KA_TRACE(
5659           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5660       this_th->th.th_cg_roots = tmp->up;
5661       __kmp_free(tmp);
5662     } else { // Worker thread
5663       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5664         __kmp_free(tmp);
5665       }
5666       this_th->th.th_cg_roots = NULL;
5667       break;
5668     }
5669   }
5670 
5671   /* If the implicit task assigned to this thread can be used by other threads
5672    * -> multiple threads can share the data and try to free the task at
5673    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5674    * with higher probability when hot team is disabled but can occurs even when
5675    * the hot team is enabled */
5676   __kmp_free_implicit_task(this_th);
5677   this_th->th.th_current_task = NULL;
5678 
5679   // If the __kmp_thread_pool_insert_pt is already past the new insert
5680   // point, then we need to re-scan the entire list.
5681   gtid = this_th->th.th_info.ds.ds_gtid;
5682   if (__kmp_thread_pool_insert_pt != NULL) {
5683     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5684     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5685       __kmp_thread_pool_insert_pt = NULL;
5686     }
5687   }
5688 
5689   // Scan down the list to find the place to insert the thread.
5690   // scan is the address of a link in the list, possibly the address of
5691   // __kmp_thread_pool itself.
5692   //
5693   // In the absence of nested parallelism, the for loop will have 0 iterations.
5694   if (__kmp_thread_pool_insert_pt != NULL) {
5695     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5696   } else {
5697     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5698   }
5699   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5700        scan = &((*scan)->th.th_next_pool))
5701     ;
5702 
5703   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5704   // to its address.
5705   TCW_PTR(this_th->th.th_next_pool, *scan);
5706   __kmp_thread_pool_insert_pt = *scan = this_th;
5707   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5708                    (this_th->th.th_info.ds.ds_gtid <
5709                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5710   TCW_4(this_th->th.th_in_pool, TRUE);
5711   __kmp_suspend_initialize_thread(this_th);
5712   __kmp_lock_suspend_mx(this_th);
5713   if (this_th->th.th_active == TRUE) {
5714     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5715     this_th->th.th_active_in_pool = TRUE;
5716   }
5717 #if KMP_DEBUG
5718   else {
5719     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5720   }
5721 #endif
5722   __kmp_unlock_suspend_mx(this_th);
5723 
5724   TCW_4(__kmp_nth, __kmp_nth - 1);
5725 
5726 #ifdef KMP_ADJUST_BLOCKTIME
5727   /* Adjust blocktime back to user setting or default if necessary */
5728   /* Middle initialization might never have occurred                */
5729   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5730     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5731     if (__kmp_nth <= __kmp_avail_proc) {
5732       __kmp_zero_bt = FALSE;
5733     }
5734   }
5735 #endif /* KMP_ADJUST_BLOCKTIME */
5736 
5737   KMP_MB();
5738 }
5739 
5740 /* ------------------------------------------------------------------------ */
5741 
5742 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5743 #if OMPTARGET_PROFILING_SUPPORT
5744   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5745   // TODO: add a configuration option for time granularity
5746   if (ProfileTraceFile)
5747     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5748 #endif
5749 
5750   int gtid = this_thr->th.th_info.ds.ds_gtid;
5751   /*    void                 *stack_data;*/
5752   kmp_team_t **volatile pteam;
5753 
5754   KMP_MB();
5755   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5756 
5757   if (__kmp_env_consistency_check) {
5758     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5759   }
5760 
5761 #if OMPT_SUPPORT
5762   ompt_data_t *thread_data;
5763   if (ompt_enabled.enabled) {
5764     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5765     *thread_data = ompt_data_none;
5766 
5767     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5768     this_thr->th.ompt_thread_info.wait_id = 0;
5769     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5770     this_thr->th.ompt_thread_info.parallel_flags = 0;
5771     if (ompt_enabled.ompt_callback_thread_begin) {
5772       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5773           ompt_thread_worker, thread_data);
5774     }
5775     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5776   }
5777 #endif
5778 
5779   /* This is the place where threads wait for work */
5780   while (!TCR_4(__kmp_global.g.g_done)) {
5781     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5782     KMP_MB();
5783 
5784     /* wait for work to do */
5785     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5786 
5787     /* No tid yet since not part of a team */
5788     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5789 
5790 #if OMPT_SUPPORT
5791     if (ompt_enabled.enabled) {
5792       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5793     }
5794 #endif
5795 
5796     pteam = &this_thr->th.th_team;
5797 
5798     /* have we been allocated? */
5799     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5800       /* we were just woken up, so run our new task */
5801       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5802         int rc;
5803         KA_TRACE(20,
5804                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5805                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5806                   (*pteam)->t.t_pkfn));
5807 
5808         updateHWFPControl(*pteam);
5809 
5810 #if OMPT_SUPPORT
5811         if (ompt_enabled.enabled) {
5812           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5813         }
5814 #endif
5815 
5816         rc = (*pteam)->t.t_invoke(gtid);
5817         KMP_ASSERT(rc);
5818 
5819         KMP_MB();
5820         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5821                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5822                       (*pteam)->t.t_pkfn));
5823       }
5824 #if OMPT_SUPPORT
5825       if (ompt_enabled.enabled) {
5826         /* no frame set while outside task */
5827         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5828 
5829         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5830       }
5831 #endif
5832       /* join barrier after parallel region */
5833       __kmp_join_barrier(gtid);
5834     }
5835   }
5836   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5837 
5838 #if OMPT_SUPPORT
5839   if (ompt_enabled.ompt_callback_thread_end) {
5840     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5841   }
5842 #endif
5843 
5844   this_thr->th.th_task_team = NULL;
5845   /* run the destructors for the threadprivate data for this thread */
5846   __kmp_common_destroy_gtid(gtid);
5847 
5848   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5849   KMP_MB();
5850 
5851 #if OMPTARGET_PROFILING_SUPPORT
5852   llvm::timeTraceProfilerFinishThread();
5853 #endif
5854   return this_thr;
5855 }
5856 
5857 /* ------------------------------------------------------------------------ */
5858 
5859 void __kmp_internal_end_dest(void *specific_gtid) {
5860   // Make sure no significant bits are lost
5861   int gtid;
5862   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5863 
5864   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5865   /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5866    * this is because 0 is reserved for the nothing-stored case */
5867 
5868   __kmp_internal_end_thread(gtid);
5869 }
5870 
5871 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5872 
5873 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5874   __kmp_internal_end_atexit();
5875 }
5876 
5877 #endif
5878 
5879 /* [Windows] josh: when the atexit handler is called, there may still be more
5880    than one thread alive */
5881 void __kmp_internal_end_atexit(void) {
5882   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5883   /* [Windows]
5884      josh: ideally, we want to completely shutdown the library in this atexit
5885      handler, but stat code that depends on thread specific data for gtid fails
5886      because that data becomes unavailable at some point during the shutdown, so
5887      we call __kmp_internal_end_thread instead. We should eventually remove the
5888      dependency on __kmp_get_specific_gtid in the stat code and use
5889      __kmp_internal_end_library to cleanly shutdown the library.
5890 
5891      // TODO: Can some of this comment about GVS be removed?
5892      I suspect that the offending stat code is executed when the calling thread
5893      tries to clean up a dead root thread's data structures, resulting in GVS
5894      code trying to close the GVS structures for that thread, but since the stat
5895      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5896      the calling thread is cleaning up itself instead of another thread, it get
5897      confused. This happens because allowing a thread to unregister and cleanup
5898      another thread is a recent modification for addressing an issue.
5899      Based on the current design (20050722), a thread may end up
5900      trying to unregister another thread only if thread death does not trigger
5901      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5902      thread specific data destructor function to detect thread death. For
5903      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5904      is nothing.  Thus, the workaround is applicable only for Windows static
5905      stat library. */
5906   __kmp_internal_end_library(-1);
5907 #if KMP_OS_WINDOWS
5908   __kmp_close_console();
5909 #endif
5910 }
5911 
5912 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5913   // It is assumed __kmp_forkjoin_lock is acquired.
5914 
5915   int gtid;
5916 
5917   KMP_DEBUG_ASSERT(thread != NULL);
5918 
5919   gtid = thread->th.th_info.ds.ds_gtid;
5920 
5921   if (!is_root) {
5922     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5923       /* Assume the threads are at the fork barrier here */
5924       KA_TRACE(
5925           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5926                gtid));
5927       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5928        * (GEH) */
5929       ANNOTATE_HAPPENS_BEFORE(thread);
5930       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5931                          thread);
5932       __kmp_release_64(&flag);
5933     }
5934 
5935     // Terminate OS thread.
5936     __kmp_reap_worker(thread);
5937 
5938     // The thread was killed asynchronously.  If it was actively
5939     // spinning in the thread pool, decrement the global count.
5940     //
5941     // There is a small timing hole here - if the worker thread was just waking
5942     // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5943     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5944     // the global counter might not get updated.
5945     //
5946     // Currently, this can only happen as the library is unloaded,
5947     // so there are no harmful side effects.
5948     if (thread->th.th_active_in_pool) {
5949       thread->th.th_active_in_pool = FALSE;
5950       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5951       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5952     }
5953   }
5954 
5955   __kmp_free_implicit_task(thread);
5956 
5957 // Free the fast memory for tasking
5958 #if USE_FAST_MEMORY
5959   __kmp_free_fast_memory(thread);
5960 #endif /* USE_FAST_MEMORY */
5961 
5962   __kmp_suspend_uninitialize_thread(thread);
5963 
5964   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5965   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5966 
5967   --__kmp_all_nth;
5968 // __kmp_nth was decremented when thread is added to the pool.
5969 
5970 #ifdef KMP_ADJUST_BLOCKTIME
5971   /* Adjust blocktime back to user setting or default if necessary */
5972   /* Middle initialization might never have occurred                */
5973   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5974     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5975     if (__kmp_nth <= __kmp_avail_proc) {
5976       __kmp_zero_bt = FALSE;
5977     }
5978   }
5979 #endif /* KMP_ADJUST_BLOCKTIME */
5980 
5981   /* free the memory being used */
5982   if (__kmp_env_consistency_check) {
5983     if (thread->th.th_cons) {
5984       __kmp_free_cons_stack(thread->th.th_cons);
5985       thread->th.th_cons = NULL;
5986     }
5987   }
5988 
5989   if (thread->th.th_pri_common != NULL) {
5990     __kmp_free(thread->th.th_pri_common);
5991     thread->th.th_pri_common = NULL;
5992   }
5993 
5994   if (thread->th.th_task_state_memo_stack != NULL) {
5995     __kmp_free(thread->th.th_task_state_memo_stack);
5996     thread->th.th_task_state_memo_stack = NULL;
5997   }
5998 
5999 #if KMP_USE_BGET
6000   if (thread->th.th_local.bget_data != NULL) {
6001     __kmp_finalize_bget(thread);
6002   }
6003 #endif
6004 
6005 #if KMP_AFFINITY_SUPPORTED
6006   if (thread->th.th_affin_mask != NULL) {
6007     KMP_CPU_FREE(thread->th.th_affin_mask);
6008     thread->th.th_affin_mask = NULL;
6009   }
6010 #endif /* KMP_AFFINITY_SUPPORTED */
6011 
6012 #if KMP_USE_HIER_SCHED
6013   if (thread->th.th_hier_bar_data != NULL) {
6014     __kmp_free(thread->th.th_hier_bar_data);
6015     thread->th.th_hier_bar_data = NULL;
6016   }
6017 #endif
6018 
6019   __kmp_reap_team(thread->th.th_serial_team);
6020   thread->th.th_serial_team = NULL;
6021   __kmp_free(thread);
6022 
6023   KMP_MB();
6024 
6025 } // __kmp_reap_thread
6026 
6027 static void __kmp_internal_end(void) {
6028   int i;
6029 
6030   /* First, unregister the library */
6031   __kmp_unregister_library();
6032 
6033 #if KMP_OS_WINDOWS
6034   /* In Win static library, we can't tell when a root actually dies, so we
6035      reclaim the data structures for any root threads that have died but not
6036      unregistered themselves, in order to shut down cleanly.
6037      In Win dynamic library we also can't tell when a thread dies.  */
6038   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6039 // dead roots
6040 #endif
6041 
6042   for (i = 0; i < __kmp_threads_capacity; i++)
6043     if (__kmp_root[i])
6044       if (__kmp_root[i]->r.r_active)
6045         break;
6046   KMP_MB(); /* Flush all pending memory write invalidates.  */
6047   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6048 
6049   if (i < __kmp_threads_capacity) {
6050 #if KMP_USE_MONITOR
6051     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6052     KMP_MB(); /* Flush all pending memory write invalidates.  */
6053 
6054     // Need to check that monitor was initialized before reaping it. If we are
6055     // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6056     // __kmp_monitor will appear to contain valid data, but it is only valid in
6057     // the parent process, not the child.
6058     // New behavior (201008): instead of keying off of the flag
6059     // __kmp_init_parallel, the monitor thread creation is keyed off
6060     // of the new flag __kmp_init_monitor.
6061     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6062     if (TCR_4(__kmp_init_monitor)) {
6063       __kmp_reap_monitor(&__kmp_monitor);
6064       TCW_4(__kmp_init_monitor, 0);
6065     }
6066     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6067     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6068 #endif // KMP_USE_MONITOR
6069   } else {
6070 /* TODO move this to cleanup code */
6071 #ifdef KMP_DEBUG
6072     /* make sure that everything has properly ended */
6073     for (i = 0; i < __kmp_threads_capacity; i++) {
6074       if (__kmp_root[i]) {
6075         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6076         //                    there can be uber threads alive here
6077         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6078       }
6079     }
6080 #endif
6081 
6082     KMP_MB();
6083 
6084     // Reap the worker threads.
6085     // This is valid for now, but be careful if threads are reaped sooner.
6086     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6087       // Get the next thread from the pool.
6088       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6089       __kmp_thread_pool = thread->th.th_next_pool;
6090       // Reap it.
6091       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6092       thread->th.th_next_pool = NULL;
6093       thread->th.th_in_pool = FALSE;
6094       __kmp_reap_thread(thread, 0);
6095     }
6096     __kmp_thread_pool_insert_pt = NULL;
6097 
6098     // Reap teams.
6099     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6100       // Get the next team from the pool.
6101       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6102       __kmp_team_pool = team->t.t_next_pool;
6103       // Reap it.
6104       team->t.t_next_pool = NULL;
6105       __kmp_reap_team(team);
6106     }
6107 
6108     __kmp_reap_task_teams();
6109 
6110 #if KMP_OS_UNIX
6111     // Threads that are not reaped should not access any resources since they
6112     // are going to be deallocated soon, so the shutdown sequence should wait
6113     // until all threads either exit the final spin-waiting loop or begin
6114     // sleeping after the given blocktime.
6115     for (i = 0; i < __kmp_threads_capacity; i++) {
6116       kmp_info_t *thr = __kmp_threads[i];
6117       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6118         KMP_CPU_PAUSE();
6119     }
6120 #endif
6121 
6122     for (i = 0; i < __kmp_threads_capacity; ++i) {
6123       // TBD: Add some checking...
6124       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6125     }
6126 
6127     /* Make sure all threadprivate destructors get run by joining with all
6128        worker threads before resetting this flag */
6129     TCW_SYNC_4(__kmp_init_common, FALSE);
6130 
6131     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6132     KMP_MB();
6133 
6134 #if KMP_USE_MONITOR
6135     // See note above: One of the possible fixes for CQ138434 / CQ140126
6136     //
6137     // FIXME: push both code fragments down and CSE them?
6138     // push them into __kmp_cleanup() ?
6139     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6140     if (TCR_4(__kmp_init_monitor)) {
6141       __kmp_reap_monitor(&__kmp_monitor);
6142       TCW_4(__kmp_init_monitor, 0);
6143     }
6144     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6145     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6146 #endif
6147   } /* else !__kmp_global.t_active */
6148   TCW_4(__kmp_init_gtid, FALSE);
6149   KMP_MB(); /* Flush all pending memory write invalidates.  */
6150 
6151   __kmp_cleanup();
6152 #if OMPT_SUPPORT
6153   ompt_fini();
6154 #endif
6155 }
6156 
6157 void __kmp_internal_end_library(int gtid_req) {
6158   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6159   /* this shouldn't be a race condition because __kmp_internal_end() is the
6160      only place to clear __kmp_serial_init */
6161   /* we'll check this later too, after we get the lock */
6162   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6163   // redundant, because the next check will work in any case.
6164   if (__kmp_global.g.g_abort) {
6165     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6166     /* TODO abort? */
6167     return;
6168   }
6169   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6170     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6171     return;
6172   }
6173 
6174   KMP_MB(); /* Flush all pending memory write invalidates.  */
6175   /* find out who we are and what we should do */
6176   {
6177     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6178     KA_TRACE(
6179         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6180     if (gtid == KMP_GTID_SHUTDOWN) {
6181       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6182                     "already shutdown\n"));
6183       return;
6184     } else if (gtid == KMP_GTID_MONITOR) {
6185       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6186                     "registered, or system shutdown\n"));
6187       return;
6188     } else if (gtid == KMP_GTID_DNE) {
6189       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6190                     "shutdown\n"));
6191       /* we don't know who we are, but we may still shutdown the library */
6192     } else if (KMP_UBER_GTID(gtid)) {
6193       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6194       if (__kmp_root[gtid]->r.r_active) {
6195         __kmp_global.g.g_abort = -1;
6196         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6197         __kmp_unregister_library();
6198         KA_TRACE(10,
6199                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6200                   gtid));
6201         return;
6202       } else {
6203         KA_TRACE(
6204             10,
6205             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6206         __kmp_unregister_root_current_thread(gtid);
6207       }
6208     } else {
6209 /* worker threads may call this function through the atexit handler, if they
6210  * call exit() */
6211 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6212    TODO: do a thorough shutdown instead */
6213 #ifdef DUMP_DEBUG_ON_EXIT
6214       if (__kmp_debug_buf)
6215         __kmp_dump_debug_buffer();
6216 #endif
6217       // added unregister library call here when we switch to shm linux
6218       // if we don't, it will leave lots of files in /dev/shm
6219       // cleanup shared memory file before exiting.
6220       __kmp_unregister_library();
6221       return;
6222     }
6223   }
6224   /* synchronize the termination process */
6225   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6226 
6227   /* have we already finished */
6228   if (__kmp_global.g.g_abort) {
6229     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6230     /* TODO abort? */
6231     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6232     return;
6233   }
6234   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6235     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6236     return;
6237   }
6238 
6239   /* We need this lock to enforce mutex between this reading of
6240      __kmp_threads_capacity and the writing by __kmp_register_root.
6241      Alternatively, we can use a counter of roots that is atomically updated by
6242      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6243      __kmp_internal_end_*.  */
6244   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6245 
6246   /* now we can safely conduct the actual termination */
6247   __kmp_internal_end();
6248 
6249   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6250   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6251 
6252   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6253 
6254 #ifdef DUMP_DEBUG_ON_EXIT
6255   if (__kmp_debug_buf)
6256     __kmp_dump_debug_buffer();
6257 #endif
6258 
6259 #if KMP_OS_WINDOWS
6260   __kmp_close_console();
6261 #endif
6262 
6263   __kmp_fini_allocator();
6264 
6265 } // __kmp_internal_end_library
6266 
6267 void __kmp_internal_end_thread(int gtid_req) {
6268   int i;
6269 
6270   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6271   /* this shouldn't be a race condition because __kmp_internal_end() is the
6272    * only place to clear __kmp_serial_init */
6273   /* we'll check this later too, after we get the lock */
6274   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6275   // redundant, because the next check will work in any case.
6276   if (__kmp_global.g.g_abort) {
6277     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6278     /* TODO abort? */
6279     return;
6280   }
6281   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6282     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6283     return;
6284   }
6285 
6286   // If hidden helper team has been initialized, we need to deinit it
6287   if (TCR_4(__kmp_init_hidden_helper)) {
6288     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6289     // First release the main thread to let it continue its work
6290     __kmp_hidden_helper_main_thread_release();
6291     // Wait until the hidden helper team has been destroyed
6292     __kmp_hidden_helper_threads_deinitz_wait();
6293   }
6294 
6295   KMP_MB(); /* Flush all pending memory write invalidates.  */
6296 
6297   /* find out who we are and what we should do */
6298   {
6299     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6300     KA_TRACE(10,
6301              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6302     if (gtid == KMP_GTID_SHUTDOWN) {
6303       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6304                     "already shutdown\n"));
6305       return;
6306     } else if (gtid == KMP_GTID_MONITOR) {
6307       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6308                     "registered, or system shutdown\n"));
6309       return;
6310     } else if (gtid == KMP_GTID_DNE) {
6311       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6312                     "shutdown\n"));
6313       return;
6314       /* we don't know who we are */
6315     } else if (KMP_UBER_GTID(gtid)) {
6316       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6317       if (__kmp_root[gtid]->r.r_active) {
6318         __kmp_global.g.g_abort = -1;
6319         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6320         KA_TRACE(10,
6321                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6322                   gtid));
6323         return;
6324       } else {
6325         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6326                       gtid));
6327         __kmp_unregister_root_current_thread(gtid);
6328       }
6329     } else {
6330       /* just a worker thread, let's leave */
6331       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6332 
6333       if (gtid >= 0) {
6334         __kmp_threads[gtid]->th.th_task_team = NULL;
6335       }
6336 
6337       KA_TRACE(10,
6338                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6339                 gtid));
6340       return;
6341     }
6342   }
6343 #if KMP_DYNAMIC_LIB
6344   if (__kmp_pause_status != kmp_hard_paused)
6345   // AC: lets not shutdown the dynamic library at the exit of uber thread,
6346   // because we will better shutdown later in the library destructor.
6347   {
6348     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6349     return;
6350   }
6351 #endif
6352   /* synchronize the termination process */
6353   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6354 
6355   /* have we already finished */
6356   if (__kmp_global.g.g_abort) {
6357     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6358     /* TODO abort? */
6359     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6360     return;
6361   }
6362   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6363     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6364     return;
6365   }
6366 
6367   /* We need this lock to enforce mutex between this reading of
6368      __kmp_threads_capacity and the writing by __kmp_register_root.
6369      Alternatively, we can use a counter of roots that is atomically updated by
6370      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6371      __kmp_internal_end_*.  */
6372 
6373   /* should we finish the run-time?  are all siblings done? */
6374   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6375 
6376   for (i = 0; i < __kmp_threads_capacity; ++i) {
6377     if (KMP_UBER_GTID(i)) {
6378       KA_TRACE(
6379           10,
6380           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6381       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6382       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6383       return;
6384     }
6385   }
6386 
6387   /* now we can safely conduct the actual termination */
6388 
6389   __kmp_internal_end();
6390 
6391   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6392   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6393 
6394   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6395 
6396 #ifdef DUMP_DEBUG_ON_EXIT
6397   if (__kmp_debug_buf)
6398     __kmp_dump_debug_buffer();
6399 #endif
6400 } // __kmp_internal_end_thread
6401 
6402 // -----------------------------------------------------------------------------
6403 // Library registration stuff.
6404 
6405 static long __kmp_registration_flag = 0;
6406 // Random value used to indicate library initialization.
6407 static char *__kmp_registration_str = NULL;
6408 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6409 
6410 static inline char *__kmp_reg_status_name() {
6411 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6412    each thread. If registration and unregistration go in different threads
6413    (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6414    env var can not be found, because the name will contain different pid. */
6415 // macOS* complains about name being too long with additional getuid()
6416 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6417   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6418                           (int)getuid());
6419 #else
6420   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6421 #endif
6422 } // __kmp_reg_status_get
6423 
6424 void __kmp_register_library_startup(void) {
6425 
6426   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6427   int done = 0;
6428   union {
6429     double dtime;
6430     long ltime;
6431   } time;
6432 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6433   __kmp_initialize_system_tick();
6434 #endif
6435   __kmp_read_system_time(&time.dtime);
6436   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6437   __kmp_registration_str =
6438       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6439                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6440 
6441   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6442                 __kmp_registration_str));
6443 
6444   while (!done) {
6445 
6446     char *value = NULL; // Actual value of the environment variable.
6447 
6448 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6449     char *shm_name = __kmp_str_format("/%s", name);
6450     int shm_preexist = 0;
6451     char *data1;
6452     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6453     if ((fd1 == -1) && (errno == EEXIST)) {
6454       // file didn't open because it already exists.
6455       // try opening existing file
6456       fd1 = shm_open(shm_name, O_RDWR, 0666);
6457       if (fd1 == -1) { // file didn't open
6458         // error out here
6459         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6460                     __kmp_msg_null);
6461       } else {
6462         // able to open existing file
6463         shm_preexist = 1;
6464       }
6465     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6466       // already exists.
6467       // error out here.
6468       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6469                   __kmp_msg_null);
6470     }
6471     if (shm_preexist == 0) {
6472       // we created SHM now set size
6473       if (ftruncate(fd1, SHM_SIZE) == -1) {
6474         // error occured setting size;
6475         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6476                     KMP_ERR(errno), __kmp_msg_null);
6477       }
6478     }
6479     data1 =
6480         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6481     if (data1 == MAP_FAILED) {
6482       // failed to map shared memory
6483       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6484                   __kmp_msg_null);
6485     }
6486     if (shm_preexist == 0) { // set data to SHM, set value
6487       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6488     }
6489     // Read value from either what we just wrote or existing file.
6490     value = __kmp_str_format("%s", data1); // read value from SHM
6491     munmap(data1, SHM_SIZE);
6492     close(fd1);
6493 #else // Windows and unix with static library
6494     // Set environment variable, but do not overwrite if it is exist.
6495     __kmp_env_set(name, __kmp_registration_str, 0);
6496     // read value to see if it got set
6497     value = __kmp_env_get(name);
6498 #endif
6499 
6500     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6501       done = 1; // Ok, environment variable set successfully, exit the loop.
6502     } else {
6503       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6504       // Check whether it alive or dead.
6505       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6506       char *tail = value;
6507       char *flag_addr_str = NULL;
6508       char *flag_val_str = NULL;
6509       char const *file_name = NULL;
6510       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6511       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6512       file_name = tail;
6513       if (tail != NULL) {
6514         long *flag_addr = 0;
6515         long flag_val = 0;
6516         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6517         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6518         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6519           // First, check whether environment-encoded address is mapped into
6520           // addr space.
6521           // If so, dereference it to see if it still has the right value.
6522           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6523             neighbor = 1;
6524           } else {
6525             // If not, then we know the other copy of the library is no longer
6526             // running.
6527             neighbor = 2;
6528           }
6529         }
6530       }
6531       switch (neighbor) {
6532       case 0: // Cannot parse environment variable -- neighbor status unknown.
6533         // Assume it is the incompatible format of future version of the
6534         // library. Assume the other library is alive.
6535         // WARN( ... ); // TODO: Issue a warning.
6536         file_name = "unknown library";
6537         KMP_FALLTHROUGH();
6538       // Attention! Falling to the next case. That's intentional.
6539       case 1: { // Neighbor is alive.
6540         // Check it is allowed.
6541         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6542         if (!__kmp_str_match_true(duplicate_ok)) {
6543           // That's not allowed. Issue fatal error.
6544           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6545                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6546         }
6547         KMP_INTERNAL_FREE(duplicate_ok);
6548         __kmp_duplicate_library_ok = 1;
6549         done = 1; // Exit the loop.
6550       } break;
6551       case 2: { // Neighbor is dead.
6552 
6553 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6554         // close shared memory.
6555         shm_unlink(shm_name); // this removes file in /dev/shm
6556 #else
6557         // Clear the variable and try to register library again.
6558         __kmp_env_unset(name);
6559 #endif
6560       } break;
6561       default: { KMP_DEBUG_ASSERT(0); } break;
6562       }
6563     }
6564     KMP_INTERNAL_FREE((void *)value);
6565 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6566     KMP_INTERNAL_FREE((void *)shm_name);
6567 #endif
6568   } // while
6569   KMP_INTERNAL_FREE((void *)name);
6570 
6571 } // func __kmp_register_library_startup
6572 
6573 void __kmp_unregister_library(void) {
6574 
6575   char *name = __kmp_reg_status_name();
6576   char *value = NULL;
6577 
6578 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6579   char *shm_name = __kmp_str_format("/%s", name);
6580   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6581   if (fd1 == -1) {
6582     // file did not open. return.
6583     return;
6584   }
6585   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6586   if (data1 != MAP_FAILED) {
6587     value = __kmp_str_format("%s", data1); // read value from SHM
6588     munmap(data1, SHM_SIZE);
6589   }
6590   close(fd1);
6591 #else
6592   value = __kmp_env_get(name);
6593 #endif
6594 
6595   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6596   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6597   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6598 //  Ok, this is our variable. Delete it.
6599 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6600     shm_unlink(shm_name); // this removes file in /dev/shm
6601 #else
6602     __kmp_env_unset(name);
6603 #endif
6604   }
6605 
6606 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6607   KMP_INTERNAL_FREE(shm_name);
6608 #endif
6609 
6610   KMP_INTERNAL_FREE(__kmp_registration_str);
6611   KMP_INTERNAL_FREE(value);
6612   KMP_INTERNAL_FREE(name);
6613 
6614   __kmp_registration_flag = 0;
6615   __kmp_registration_str = NULL;
6616 
6617 } // __kmp_unregister_library
6618 
6619 // End of Library registration stuff.
6620 // -----------------------------------------------------------------------------
6621 
6622 #if KMP_MIC_SUPPORTED
6623 
6624 static void __kmp_check_mic_type() {
6625   kmp_cpuid_t cpuid_state = {0};
6626   kmp_cpuid_t *cs_p = &cpuid_state;
6627   __kmp_x86_cpuid(1, 0, cs_p);
6628   // We don't support mic1 at the moment
6629   if ((cs_p->eax & 0xff0) == 0xB10) {
6630     __kmp_mic_type = mic2;
6631   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6632     __kmp_mic_type = mic3;
6633   } else {
6634     __kmp_mic_type = non_mic;
6635   }
6636 }
6637 
6638 #endif /* KMP_MIC_SUPPORTED */
6639 
6640 #if KMP_HAVE_UMWAIT
6641 static void __kmp_user_level_mwait_init() {
6642   struct kmp_cpuid buf;
6643   __kmp_x86_cpuid(7, 0, &buf);
6644   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6645   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6646                 __kmp_umwait_enabled));
6647 }
6648 #elif KMP_HAVE_MWAIT
6649 #ifndef AT_INTELPHIUSERMWAIT
6650 // Spurious, non-existent value that should always fail to return anything.
6651 // Will be replaced with the correct value when we know that.
6652 #define AT_INTELPHIUSERMWAIT 10000
6653 #endif
6654 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6655 // earlier OS is used to build the RTL, we'll use the following internal
6656 // function when the entry is not found.
6657 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6658 unsigned long getauxval(unsigned long) { return 0; }
6659 
6660 static void __kmp_user_level_mwait_init() {
6661   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6662   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6663   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6664   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6665   if (__kmp_mic_type == mic3) {
6666     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6667     if ((res & 0x1) || __kmp_user_level_mwait) {
6668       __kmp_mwait_enabled = TRUE;
6669       if (__kmp_user_level_mwait) {
6670         KMP_INFORM(EnvMwaitWarn);
6671       }
6672     } else {
6673       __kmp_mwait_enabled = FALSE;
6674     }
6675   }
6676   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6677                 "__kmp_mwait_enabled = %d\n",
6678                 __kmp_mic_type, __kmp_mwait_enabled));
6679 }
6680 #endif /* KMP_HAVE_UMWAIT */
6681 
6682 static void __kmp_do_serial_initialize(void) {
6683   int i, gtid;
6684   size_t size;
6685 
6686   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6687 
6688   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6689   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6690   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6691   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6692   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6693 
6694 #if OMPT_SUPPORT
6695   ompt_pre_init();
6696 #endif
6697 
6698   __kmp_validate_locks();
6699 
6700   /* Initialize internal memory allocator */
6701   __kmp_init_allocator();
6702 
6703   /* Register the library startup via an environment variable and check to see
6704      whether another copy of the library is already registered. */
6705 
6706   __kmp_register_library_startup();
6707 
6708   /* TODO reinitialization of library */
6709   if (TCR_4(__kmp_global.g.g_done)) {
6710     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6711   }
6712 
6713   __kmp_global.g.g_abort = 0;
6714   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6715 
6716 /* initialize the locks */
6717 #if KMP_USE_ADAPTIVE_LOCKS
6718 #if KMP_DEBUG_ADAPTIVE_LOCKS
6719   __kmp_init_speculative_stats();
6720 #endif
6721 #endif
6722 #if KMP_STATS_ENABLED
6723   __kmp_stats_init();
6724 #endif
6725   __kmp_init_lock(&__kmp_global_lock);
6726   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6727   __kmp_init_lock(&__kmp_debug_lock);
6728   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6729   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6730   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6731   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6732   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6733   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6734   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6735   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6736   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6737   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6738   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6739   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6740   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6741   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6742   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6743 #if KMP_USE_MONITOR
6744   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6745 #endif
6746   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6747 
6748   /* conduct initialization and initial setup of configuration */
6749 
6750   __kmp_runtime_initialize();
6751 
6752 #if KMP_MIC_SUPPORTED
6753   __kmp_check_mic_type();
6754 #endif
6755 
6756 // Some global variable initialization moved here from kmp_env_initialize()
6757 #ifdef KMP_DEBUG
6758   kmp_diag = 0;
6759 #endif
6760   __kmp_abort_delay = 0;
6761 
6762   // From __kmp_init_dflt_team_nth()
6763   /* assume the entire machine will be used */
6764   __kmp_dflt_team_nth_ub = __kmp_xproc;
6765   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6766     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6767   }
6768   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6769     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6770   }
6771   __kmp_max_nth = __kmp_sys_max_nth;
6772   __kmp_cg_max_nth = __kmp_sys_max_nth;
6773   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6774   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6775     __kmp_teams_max_nth = __kmp_sys_max_nth;
6776   }
6777 
6778   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6779   // part
6780   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6781 #if KMP_USE_MONITOR
6782   __kmp_monitor_wakeups =
6783       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6784   __kmp_bt_intervals =
6785       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6786 #endif
6787   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6788   __kmp_library = library_throughput;
6789   // From KMP_SCHEDULE initialization
6790   __kmp_static = kmp_sch_static_balanced;
6791 // AC: do not use analytical here, because it is non-monotonous
6792 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6793 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6794 // need to repeat assignment
6795 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6796 // bit control and barrier method control parts
6797 #if KMP_FAST_REDUCTION_BARRIER
6798 #define kmp_reduction_barrier_gather_bb ((int)1)
6799 #define kmp_reduction_barrier_release_bb ((int)1)
6800 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6801 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6802 #endif // KMP_FAST_REDUCTION_BARRIER
6803   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6804     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6805     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6806     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6807     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6808 #if KMP_FAST_REDUCTION_BARRIER
6809     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6810       // lin_64 ): hyper,1
6811       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6812       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6813       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6814       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6815     }
6816 #endif // KMP_FAST_REDUCTION_BARRIER
6817   }
6818 #if KMP_FAST_REDUCTION_BARRIER
6819 #undef kmp_reduction_barrier_release_pat
6820 #undef kmp_reduction_barrier_gather_pat
6821 #undef kmp_reduction_barrier_release_bb
6822 #undef kmp_reduction_barrier_gather_bb
6823 #endif // KMP_FAST_REDUCTION_BARRIER
6824 #if KMP_MIC_SUPPORTED
6825   if (__kmp_mic_type == mic2) { // KNC
6826     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6827     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6828     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6829         1; // forkjoin release
6830     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6831     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6832   }
6833 #if KMP_FAST_REDUCTION_BARRIER
6834   if (__kmp_mic_type == mic2) { // KNC
6835     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6836     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6837   }
6838 #endif // KMP_FAST_REDUCTION_BARRIER
6839 #endif // KMP_MIC_SUPPORTED
6840 
6841 // From KMP_CHECKS initialization
6842 #ifdef KMP_DEBUG
6843   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6844 #else
6845   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6846 #endif
6847 
6848   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6849   __kmp_foreign_tp = TRUE;
6850 
6851   __kmp_global.g.g_dynamic = FALSE;
6852   __kmp_global.g.g_dynamic_mode = dynamic_default;
6853 
6854   __kmp_env_initialize(NULL);
6855 
6856 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6857   __kmp_user_level_mwait_init();
6858 #endif
6859 // Print all messages in message catalog for testing purposes.
6860 #ifdef KMP_DEBUG
6861   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6862   if (__kmp_str_match_true(val)) {
6863     kmp_str_buf_t buffer;
6864     __kmp_str_buf_init(&buffer);
6865     __kmp_i18n_dump_catalog(&buffer);
6866     __kmp_printf("%s", buffer.str);
6867     __kmp_str_buf_free(&buffer);
6868   }
6869   __kmp_env_free(&val);
6870 #endif
6871 
6872   __kmp_threads_capacity =
6873       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6874   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6875   __kmp_tp_capacity = __kmp_default_tp_capacity(
6876       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6877 
6878   // If the library is shut down properly, both pools must be NULL. Just in
6879   // case, set them to NULL -- some memory may leak, but subsequent code will
6880   // work even if pools are not freed.
6881   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6882   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6883   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6884   __kmp_thread_pool = NULL;
6885   __kmp_thread_pool_insert_pt = NULL;
6886   __kmp_team_pool = NULL;
6887 
6888   /* Allocate all of the variable sized records */
6889   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6890    * expandable */
6891   /* Since allocation is cache-aligned, just add extra padding at the end */
6892   size =
6893       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6894       CACHE_LINE;
6895   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6896   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6897                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6898 
6899   /* init thread counts */
6900   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6901                    0); // Asserts fail if the library is reinitializing and
6902   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6903   __kmp_all_nth = 0;
6904   __kmp_nth = 0;
6905 
6906   /* setup the uber master thread and hierarchy */
6907   gtid = __kmp_register_root(TRUE);
6908   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6909   KMP_ASSERT(KMP_UBER_GTID(gtid));
6910   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6911 
6912   KMP_MB(); /* Flush all pending memory write invalidates.  */
6913 
6914   __kmp_common_initialize();
6915 
6916 #if KMP_OS_UNIX
6917   /* invoke the child fork handler */
6918   __kmp_register_atfork();
6919 #endif
6920 
6921 #if !KMP_DYNAMIC_LIB
6922   {
6923     /* Invoke the exit handler when the program finishes, only for static
6924        library. For dynamic library, we already have _fini and DllMain. */
6925     int rc = atexit(__kmp_internal_end_atexit);
6926     if (rc != 0) {
6927       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6928                   __kmp_msg_null);
6929     }
6930   }
6931 #endif
6932 
6933 #if KMP_HANDLE_SIGNALS
6934 #if KMP_OS_UNIX
6935   /* NOTE: make sure that this is called before the user installs their own
6936      signal handlers so that the user handlers are called first. this way they
6937      can return false, not call our handler, avoid terminating the library, and
6938      continue execution where they left off. */
6939   __kmp_install_signals(FALSE);
6940 #endif /* KMP_OS_UNIX */
6941 #if KMP_OS_WINDOWS
6942   __kmp_install_signals(TRUE);
6943 #endif /* KMP_OS_WINDOWS */
6944 #endif
6945 
6946   /* we have finished the serial initialization */
6947   __kmp_init_counter++;
6948 
6949   __kmp_init_serial = TRUE;
6950 
6951   if (__kmp_settings) {
6952     __kmp_env_print();
6953   }
6954 
6955   if (__kmp_display_env || __kmp_display_env_verbose) {
6956     __kmp_env_print_2();
6957   }
6958 
6959 #if OMPT_SUPPORT
6960   ompt_post_init();
6961 #endif
6962 
6963   KMP_MB();
6964 
6965   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6966 }
6967 
6968 void __kmp_serial_initialize(void) {
6969   if (__kmp_init_serial) {
6970     return;
6971   }
6972   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6973   if (__kmp_init_serial) {
6974     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6975     return;
6976   }
6977   __kmp_do_serial_initialize();
6978   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6979 }
6980 
6981 static void __kmp_do_middle_initialize(void) {
6982   int i, j;
6983   int prev_dflt_team_nth;
6984 
6985   if (!__kmp_init_serial) {
6986     __kmp_do_serial_initialize();
6987   }
6988 
6989   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6990 
6991   // Save the previous value for the __kmp_dflt_team_nth so that
6992   // we can avoid some reinitialization if it hasn't changed.
6993   prev_dflt_team_nth = __kmp_dflt_team_nth;
6994 
6995 #if KMP_AFFINITY_SUPPORTED
6996   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6997   // number of cores on the machine.
6998   __kmp_affinity_initialize();
6999 
7000   // Run through the __kmp_threads array and set the affinity mask
7001   // for each root thread that is currently registered with the RTL.
7002   for (i = 0; i < __kmp_threads_capacity; i++) {
7003     if (TCR_PTR(__kmp_threads[i]) != NULL) {
7004       __kmp_affinity_set_init_mask(i, TRUE);
7005     }
7006   }
7007 #endif /* KMP_AFFINITY_SUPPORTED */
7008 
7009   KMP_ASSERT(__kmp_xproc > 0);
7010   if (__kmp_avail_proc == 0) {
7011     __kmp_avail_proc = __kmp_xproc;
7012   }
7013 
7014   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7015   // correct them now
7016   j = 0;
7017   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7018     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7019         __kmp_avail_proc;
7020     j++;
7021   }
7022 
7023   if (__kmp_dflt_team_nth == 0) {
7024 #ifdef KMP_DFLT_NTH_CORES
7025     // Default #threads = #cores
7026     __kmp_dflt_team_nth = __kmp_ncores;
7027     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7028                   "__kmp_ncores (%d)\n",
7029                   __kmp_dflt_team_nth));
7030 #else
7031     // Default #threads = #available OS procs
7032     __kmp_dflt_team_nth = __kmp_avail_proc;
7033     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7034                   "__kmp_avail_proc(%d)\n",
7035                   __kmp_dflt_team_nth));
7036 #endif /* KMP_DFLT_NTH_CORES */
7037   }
7038 
7039   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7040     __kmp_dflt_team_nth = KMP_MIN_NTH;
7041   }
7042   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7043     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7044   }
7045 
7046   // There's no harm in continuing if the following check fails,
7047   // but it indicates an error in the previous logic.
7048   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7049 
7050   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7051     // Run through the __kmp_threads array and set the num threads icv for each
7052     // root thread that is currently registered with the RTL (which has not
7053     // already explicitly set its nthreads-var with a call to
7054     // omp_set_num_threads()).
7055     for (i = 0; i < __kmp_threads_capacity; i++) {
7056       kmp_info_t *thread = __kmp_threads[i];
7057       if (thread == NULL)
7058         continue;
7059       if (thread->th.th_current_task->td_icvs.nproc != 0)
7060         continue;
7061 
7062       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7063     }
7064   }
7065   KA_TRACE(
7066       20,
7067       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7068        __kmp_dflt_team_nth));
7069 
7070 #ifdef KMP_ADJUST_BLOCKTIME
7071   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7072   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7073     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7074     if (__kmp_nth > __kmp_avail_proc) {
7075       __kmp_zero_bt = TRUE;
7076     }
7077   }
7078 #endif /* KMP_ADJUST_BLOCKTIME */
7079 
7080   /* we have finished middle initialization */
7081   TCW_SYNC_4(__kmp_init_middle, TRUE);
7082 
7083   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7084 }
7085 
7086 void __kmp_middle_initialize(void) {
7087   if (__kmp_init_middle) {
7088     return;
7089   }
7090   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7091   if (__kmp_init_middle) {
7092     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7093     return;
7094   }
7095   __kmp_do_middle_initialize();
7096   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7097 }
7098 
7099 void __kmp_parallel_initialize(void) {
7100   int gtid = __kmp_entry_gtid(); // this might be a new root
7101 
7102   /* synchronize parallel initialization (for sibling) */
7103   if (TCR_4(__kmp_init_parallel))
7104     return;
7105   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7106   if (TCR_4(__kmp_init_parallel)) {
7107     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7108     return;
7109   }
7110 
7111   /* TODO reinitialization after we have already shut down */
7112   if (TCR_4(__kmp_global.g.g_done)) {
7113     KA_TRACE(
7114         10,
7115         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7116     __kmp_infinite_loop();
7117   }
7118 
7119   /* jc: The lock __kmp_initz_lock is already held, so calling
7120      __kmp_serial_initialize would cause a deadlock.  So we call
7121      __kmp_do_serial_initialize directly. */
7122   if (!__kmp_init_middle) {
7123     __kmp_do_middle_initialize();
7124   }
7125   __kmp_resume_if_hard_paused();
7126 
7127   /* begin initialization */
7128   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7129   KMP_ASSERT(KMP_UBER_GTID(gtid));
7130 
7131 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7132   // Save the FP control regs.
7133   // Worker threads will set theirs to these values at thread startup.
7134   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7135   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7136   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7137 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7138 
7139 #if KMP_OS_UNIX
7140 #if KMP_HANDLE_SIGNALS
7141   /*  must be after __kmp_serial_initialize  */
7142   __kmp_install_signals(TRUE);
7143 #endif
7144 #endif
7145 
7146   __kmp_suspend_initialize();
7147 
7148 #if defined(USE_LOAD_BALANCE)
7149   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7150     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7151   }
7152 #else
7153   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7154     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7155   }
7156 #endif
7157 
7158   if (__kmp_version) {
7159     __kmp_print_version_2();
7160   }
7161 
7162   /* we have finished parallel initialization */
7163   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7164 
7165   KMP_MB();
7166   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7167 
7168   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7169 }
7170 
7171 void __kmp_hidden_helper_initialize() {
7172   if (TCR_4(__kmp_init_hidden_helper))
7173     return;
7174 
7175   // __kmp_parallel_initialize is required before we initialize hidden helper
7176   if (!TCR_4(__kmp_init_parallel))
7177     __kmp_parallel_initialize();
7178 
7179   // Double check. Note that this double check should not be placed before
7180   // __kmp_parallel_initialize as it will cause dead lock.
7181   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7182   if (TCR_4(__kmp_init_hidden_helper)) {
7183     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7184     return;
7185   }
7186 
7187   // Set the count of hidden helper tasks to be executed to zero
7188   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7189 
7190   // Set the global variable indicating that we're initializing hidden helper
7191   // team/threads
7192   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7193 
7194   // Platform independent initialization
7195   __kmp_do_initialize_hidden_helper_threads();
7196 
7197   // Wait here for the finish of initialization of hidden helper teams
7198   __kmp_hidden_helper_threads_initz_wait();
7199 
7200   // We have finished hidden helper initialization
7201   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7202 
7203   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7204 }
7205 
7206 /* ------------------------------------------------------------------------ */
7207 
7208 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7209                                    kmp_team_t *team) {
7210   kmp_disp_t *dispatch;
7211 
7212   KMP_MB();
7213 
7214   /* none of the threads have encountered any constructs, yet. */
7215   this_thr->th.th_local.this_construct = 0;
7216 #if KMP_CACHE_MANAGE
7217   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7218 #endif /* KMP_CACHE_MANAGE */
7219   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7220   KMP_DEBUG_ASSERT(dispatch);
7221   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7222   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7223   // this_thr->th.th_info.ds.ds_tid ] );
7224 
7225   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7226   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7227   if (__kmp_env_consistency_check)
7228     __kmp_push_parallel(gtid, team->t.t_ident);
7229 
7230   KMP_MB(); /* Flush all pending memory write invalidates.  */
7231 }
7232 
7233 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7234                                   kmp_team_t *team) {
7235   if (__kmp_env_consistency_check)
7236     __kmp_pop_parallel(gtid, team->t.t_ident);
7237 
7238   __kmp_finish_implicit_task(this_thr);
7239 }
7240 
7241 int __kmp_invoke_task_func(int gtid) {
7242   int rc;
7243   int tid = __kmp_tid_from_gtid(gtid);
7244   kmp_info_t *this_thr = __kmp_threads[gtid];
7245   kmp_team_t *team = this_thr->th.th_team;
7246 
7247   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7248 #if USE_ITT_BUILD
7249   if (__itt_stack_caller_create_ptr) {
7250     __kmp_itt_stack_callee_enter(
7251         (__itt_caller)
7252             team->t.t_stack_id); // inform ittnotify about entering user's code
7253   }
7254 #endif /* USE_ITT_BUILD */
7255 #if INCLUDE_SSC_MARKS
7256   SSC_MARK_INVOKING();
7257 #endif
7258 
7259 #if OMPT_SUPPORT
7260   void *dummy;
7261   void **exit_frame_p;
7262   ompt_data_t *my_task_data;
7263   ompt_data_t *my_parallel_data;
7264   int ompt_team_size;
7265 
7266   if (ompt_enabled.enabled) {
7267     exit_frame_p = &(
7268         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7269   } else {
7270     exit_frame_p = &dummy;
7271   }
7272 
7273   my_task_data =
7274       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7275   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7276   if (ompt_enabled.ompt_callback_implicit_task) {
7277     ompt_team_size = team->t.t_nproc;
7278     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7279         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7280         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7281     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7282   }
7283 #endif
7284 
7285 #if KMP_STATS_ENABLED
7286   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7287   if (previous_state == stats_state_e::TEAMS_REGION) {
7288     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7289   } else {
7290     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7291   }
7292   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7293 #endif
7294 
7295   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7296                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7297 #if OMPT_SUPPORT
7298                               ,
7299                               exit_frame_p
7300 #endif
7301                               );
7302 #if OMPT_SUPPORT
7303   *exit_frame_p = NULL;
7304    this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7305 #endif
7306 
7307 #if KMP_STATS_ENABLED
7308   if (previous_state == stats_state_e::TEAMS_REGION) {
7309     KMP_SET_THREAD_STATE(previous_state);
7310   }
7311   KMP_POP_PARTITIONED_TIMER();
7312 #endif
7313 
7314 #if USE_ITT_BUILD
7315   if (__itt_stack_caller_create_ptr) {
7316     __kmp_itt_stack_callee_leave(
7317         (__itt_caller)
7318             team->t.t_stack_id); // inform ittnotify about leaving user's code
7319   }
7320 #endif /* USE_ITT_BUILD */
7321   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7322 
7323   return rc;
7324 }
7325 
7326 void __kmp_teams_master(int gtid) {
7327   // This routine is called by all master threads in teams construct
7328   kmp_info_t *thr = __kmp_threads[gtid];
7329   kmp_team_t *team = thr->th.th_team;
7330   ident_t *loc = team->t.t_ident;
7331   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7332   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7333   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7334   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7335                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7336 
7337   // This thread is a new CG root.  Set up the proper variables.
7338   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7339   tmp->cg_root = thr; // Make thr the CG root
7340   // Init to thread limit that was stored when league masters were forked
7341   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7342   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7343   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7344                  " cg_nthreads to 1\n",
7345                  thr, tmp));
7346   tmp->up = thr->th.th_cg_roots;
7347   thr->th.th_cg_roots = tmp;
7348 
7349 // Launch league of teams now, but not let workers execute
7350 // (they hang on fork barrier until next parallel)
7351 #if INCLUDE_SSC_MARKS
7352   SSC_MARK_FORKING();
7353 #endif
7354   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7355                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7356                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7357 #if INCLUDE_SSC_MARKS
7358   SSC_MARK_JOINING();
7359 #endif
7360   // If the team size was reduced from the limit, set it to the new size
7361   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7362     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7363   // AC: last parameter "1" eliminates join barrier which won't work because
7364   // worker threads are in a fork barrier waiting for more parallel regions
7365   __kmp_join_call(loc, gtid
7366 #if OMPT_SUPPORT
7367                   ,
7368                   fork_context_intel
7369 #endif
7370                   ,
7371                   1);
7372 }
7373 
7374 int __kmp_invoke_teams_master(int gtid) {
7375   kmp_info_t *this_thr = __kmp_threads[gtid];
7376   kmp_team_t *team = this_thr->th.th_team;
7377 #if KMP_DEBUG
7378   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7379     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7380                      (void *)__kmp_teams_master);
7381 #endif
7382   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7383 #if OMPT_SUPPORT
7384   int tid = __kmp_tid_from_gtid(gtid);
7385   ompt_data_t *task_data =
7386       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7387   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7388   if (ompt_enabled.ompt_callback_implicit_task) {
7389     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7390         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7391         ompt_task_initial);
7392     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7393   }
7394 #endif
7395   __kmp_teams_master(gtid);
7396 #if OMPT_SUPPORT
7397   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7398 #endif
7399   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7400   return 1;
7401 }
7402 
7403 /* this sets the requested number of threads for the next parallel region
7404    encountered by this team. since this should be enclosed in the forkjoin
7405    critical section it should avoid race conditions with asymmetrical nested
7406    parallelism */
7407 
7408 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7409   kmp_info_t *thr = __kmp_threads[gtid];
7410 
7411   if (num_threads > 0)
7412     thr->th.th_set_nproc = num_threads;
7413 }
7414 
7415 /* this sets the requested number of teams for the teams region and/or
7416    the number of threads for the next parallel region encountered  */
7417 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7418                           int num_threads) {
7419   kmp_info_t *thr = __kmp_threads[gtid];
7420   KMP_DEBUG_ASSERT(num_teams >= 0);
7421   KMP_DEBUG_ASSERT(num_threads >= 0);
7422 
7423   if (num_teams == 0)
7424     num_teams = 1; // default number of teams is 1.
7425   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7426     if (!__kmp_reserve_warn) {
7427       __kmp_reserve_warn = 1;
7428       __kmp_msg(kmp_ms_warning,
7429                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7430                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7431     }
7432     num_teams = __kmp_teams_max_nth;
7433   }
7434   // Set number of teams (number of threads in the outer "parallel" of the
7435   // teams)
7436   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7437 
7438   // Remember the number of threads for inner parallel regions
7439   if (!TCR_4(__kmp_init_middle))
7440     __kmp_middle_initialize(); // get internal globals calculated
7441   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7442   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7443   if (num_threads == 0) {
7444     num_threads = __kmp_avail_proc / num_teams;
7445     // adjust num_threads w/o warning as it is not user setting
7446     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7447     // no thread_limit clause specified -  do not change thread-limit-var ICV
7448     if (num_threads > __kmp_dflt_team_nth) {
7449       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7450     }
7451     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7452       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7453     } // prevent team size to exceed thread-limit-var
7454     if (num_teams * num_threads > __kmp_teams_max_nth) {
7455       num_threads = __kmp_teams_max_nth / num_teams;
7456     }
7457   } else {
7458     // This thread will be the master of the league masters
7459     // Store new thread limit; old limit is saved in th_cg_roots list
7460     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7461     // num_threads = min(num_threads, nthreads-var)
7462     if (num_threads > __kmp_dflt_team_nth) {
7463       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7464     }
7465     if (num_teams * num_threads > __kmp_teams_max_nth) {
7466       int new_threads = __kmp_teams_max_nth / num_teams;
7467       if (!__kmp_reserve_warn) { // user asked for too many threads
7468         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7469         __kmp_msg(kmp_ms_warning,
7470                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7471                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7472       }
7473       num_threads = new_threads;
7474     }
7475   }
7476   thr->th.th_teams_size.nth = num_threads;
7477 }
7478 
7479 // Set the proc_bind var to use in the following parallel region.
7480 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7481   kmp_info_t *thr = __kmp_threads[gtid];
7482   thr->th.th_set_proc_bind = proc_bind;
7483 }
7484 
7485 /* Launch the worker threads into the microtask. */
7486 
7487 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7488   kmp_info_t *this_thr = __kmp_threads[gtid];
7489 
7490 #ifdef KMP_DEBUG
7491   int f;
7492 #endif /* KMP_DEBUG */
7493 
7494   KMP_DEBUG_ASSERT(team);
7495   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7496   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7497   KMP_MB(); /* Flush all pending memory write invalidates.  */
7498 
7499   team->t.t_construct = 0; /* no single directives seen yet */
7500   team->t.t_ordered.dt.t_value =
7501       0; /* thread 0 enters the ordered section first */
7502 
7503   /* Reset the identifiers on the dispatch buffer */
7504   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7505   if (team->t.t_max_nproc > 1) {
7506     int i;
7507     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7508       team->t.t_disp_buffer[i].buffer_index = i;
7509       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7510     }
7511   } else {
7512     team->t.t_disp_buffer[0].buffer_index = 0;
7513     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7514   }
7515 
7516   KMP_MB(); /* Flush all pending memory write invalidates.  */
7517   KMP_ASSERT(this_thr->th.th_team == team);
7518 
7519 #ifdef KMP_DEBUG
7520   for (f = 0; f < team->t.t_nproc; f++) {
7521     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7522                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7523   }
7524 #endif /* KMP_DEBUG */
7525 
7526   /* release the worker threads so they may begin working */
7527   __kmp_fork_barrier(gtid, 0);
7528 }
7529 
7530 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7531   kmp_info_t *this_thr = __kmp_threads[gtid];
7532 
7533   KMP_DEBUG_ASSERT(team);
7534   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7535   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7536   KMP_MB(); /* Flush all pending memory write invalidates.  */
7537 
7538 /* Join barrier after fork */
7539 
7540 #ifdef KMP_DEBUG
7541   if (__kmp_threads[gtid] &&
7542       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7543     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7544                  __kmp_threads[gtid]);
7545     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7546                  "team->t.t_nproc=%d\n",
7547                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7548                  team->t.t_nproc);
7549     __kmp_print_structure();
7550   }
7551   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7552                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7553 #endif /* KMP_DEBUG */
7554 
7555   __kmp_join_barrier(gtid); /* wait for everyone */
7556 #if OMPT_SUPPORT
7557   if (ompt_enabled.enabled &&
7558       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7559     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7560     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7561     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7562 #if OMPT_OPTIONAL
7563     void *codeptr = NULL;
7564     if (KMP_MASTER_TID(ds_tid) &&
7565         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7566          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7567       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7568 
7569     if (ompt_enabled.ompt_callback_sync_region_wait) {
7570       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7571           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7572           codeptr);
7573     }
7574     if (ompt_enabled.ompt_callback_sync_region) {
7575       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7576           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7577           codeptr);
7578     }
7579 #endif
7580     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7581       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7582           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7583     }
7584   }
7585 #endif
7586 
7587   KMP_MB(); /* Flush all pending memory write invalidates.  */
7588   KMP_ASSERT(this_thr->th.th_team == team);
7589 }
7590 
7591 /* ------------------------------------------------------------------------ */
7592 
7593 #ifdef USE_LOAD_BALANCE
7594 
7595 // Return the worker threads actively spinning in the hot team, if we
7596 // are at the outermost level of parallelism.  Otherwise, return 0.
7597 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7598   int i;
7599   int retval;
7600   kmp_team_t *hot_team;
7601 
7602   if (root->r.r_active) {
7603     return 0;
7604   }
7605   hot_team = root->r.r_hot_team;
7606   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7607     return hot_team->t.t_nproc - 1; // Don't count master thread
7608   }
7609 
7610   // Skip the master thread - it is accounted for elsewhere.
7611   retval = 0;
7612   for (i = 1; i < hot_team->t.t_nproc; i++) {
7613     if (hot_team->t.t_threads[i]->th.th_active) {
7614       retval++;
7615     }
7616   }
7617   return retval;
7618 }
7619 
7620 // Perform an automatic adjustment to the number of
7621 // threads used by the next parallel region.
7622 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7623   int retval;
7624   int pool_active;
7625   int hot_team_active;
7626   int team_curr_active;
7627   int system_active;
7628 
7629   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7630                 set_nproc));
7631   KMP_DEBUG_ASSERT(root);
7632   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7633                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7634   KMP_DEBUG_ASSERT(set_nproc > 1);
7635 
7636   if (set_nproc == 1) {
7637     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7638     return 1;
7639   }
7640 
7641   // Threads that are active in the thread pool, active in the hot team for this
7642   // particular root (if we are at the outer par level), and the currently
7643   // executing thread (to become the master) are available to add to the new
7644   // team, but are currently contributing to the system load, and must be
7645   // accounted for.
7646   pool_active = __kmp_thread_pool_active_nth;
7647   hot_team_active = __kmp_active_hot_team_nproc(root);
7648   team_curr_active = pool_active + hot_team_active + 1;
7649 
7650   // Check the system load.
7651   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7652   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7653                 "hot team active = %d\n",
7654                 system_active, pool_active, hot_team_active));
7655 
7656   if (system_active < 0) {
7657     // There was an error reading the necessary info from /proc, so use the
7658     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7659     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7660     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7661     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7662 
7663     // Make this call behave like the thread limit algorithm.
7664     retval = __kmp_avail_proc - __kmp_nth +
7665              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7666     if (retval > set_nproc) {
7667       retval = set_nproc;
7668     }
7669     if (retval < KMP_MIN_NTH) {
7670       retval = KMP_MIN_NTH;
7671     }
7672 
7673     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7674                   retval));
7675     return retval;
7676   }
7677 
7678   // There is a slight delay in the load balance algorithm in detecting new
7679   // running procs. The real system load at this instant should be at least as
7680   // large as the #active omp thread that are available to add to the team.
7681   if (system_active < team_curr_active) {
7682     system_active = team_curr_active;
7683   }
7684   retval = __kmp_avail_proc - system_active + team_curr_active;
7685   if (retval > set_nproc) {
7686     retval = set_nproc;
7687   }
7688   if (retval < KMP_MIN_NTH) {
7689     retval = KMP_MIN_NTH;
7690   }
7691 
7692   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7693   return retval;
7694 } // __kmp_load_balance_nproc()
7695 
7696 #endif /* USE_LOAD_BALANCE */
7697 
7698 /* ------------------------------------------------------------------------ */
7699 
7700 /* NOTE: this is called with the __kmp_init_lock held */
7701 void __kmp_cleanup(void) {
7702   int f;
7703 
7704   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7705 
7706   if (TCR_4(__kmp_init_parallel)) {
7707 #if KMP_HANDLE_SIGNALS
7708     __kmp_remove_signals();
7709 #endif
7710     TCW_4(__kmp_init_parallel, FALSE);
7711   }
7712 
7713   if (TCR_4(__kmp_init_middle)) {
7714 #if KMP_AFFINITY_SUPPORTED
7715     __kmp_affinity_uninitialize();
7716 #endif /* KMP_AFFINITY_SUPPORTED */
7717     __kmp_cleanup_hierarchy();
7718     TCW_4(__kmp_init_middle, FALSE);
7719   }
7720 
7721   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7722 
7723   if (__kmp_init_serial) {
7724     __kmp_runtime_destroy();
7725     __kmp_init_serial = FALSE;
7726   }
7727 
7728   __kmp_cleanup_threadprivate_caches();
7729 
7730   for (f = 0; f < __kmp_threads_capacity; f++) {
7731     if (__kmp_root[f] != NULL) {
7732       __kmp_free(__kmp_root[f]);
7733       __kmp_root[f] = NULL;
7734     }
7735   }
7736   __kmp_free(__kmp_threads);
7737   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7738   // there is no need in freeing __kmp_root.
7739   __kmp_threads = NULL;
7740   __kmp_root = NULL;
7741   __kmp_threads_capacity = 0;
7742 
7743 #if KMP_USE_DYNAMIC_LOCK
7744   __kmp_cleanup_indirect_user_locks();
7745 #else
7746   __kmp_cleanup_user_locks();
7747 #endif
7748 
7749 #if KMP_AFFINITY_SUPPORTED
7750   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7751   __kmp_cpuinfo_file = NULL;
7752 #endif /* KMP_AFFINITY_SUPPORTED */
7753 
7754 #if KMP_USE_ADAPTIVE_LOCKS
7755 #if KMP_DEBUG_ADAPTIVE_LOCKS
7756   __kmp_print_speculative_stats();
7757 #endif
7758 #endif
7759   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7760   __kmp_nested_nth.nth = NULL;
7761   __kmp_nested_nth.size = 0;
7762   __kmp_nested_nth.used = 0;
7763   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7764   __kmp_nested_proc_bind.bind_types = NULL;
7765   __kmp_nested_proc_bind.size = 0;
7766   __kmp_nested_proc_bind.used = 0;
7767   if (__kmp_affinity_format) {
7768     KMP_INTERNAL_FREE(__kmp_affinity_format);
7769     __kmp_affinity_format = NULL;
7770   }
7771 
7772   __kmp_i18n_catclose();
7773 
7774 #if KMP_USE_HIER_SCHED
7775   __kmp_hier_scheds.deallocate();
7776 #endif
7777 
7778 #if KMP_STATS_ENABLED
7779   __kmp_stats_fini();
7780 #endif
7781 
7782   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7783 }
7784 
7785 /* ------------------------------------------------------------------------ */
7786 
7787 int __kmp_ignore_mppbeg(void) {
7788   char *env;
7789 
7790   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7791     if (__kmp_str_match_false(env))
7792       return FALSE;
7793   }
7794   // By default __kmpc_begin() is no-op.
7795   return TRUE;
7796 }
7797 
7798 int __kmp_ignore_mppend(void) {
7799   char *env;
7800 
7801   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7802     if (__kmp_str_match_false(env))
7803       return FALSE;
7804   }
7805   // By default __kmpc_end() is no-op.
7806   return TRUE;
7807 }
7808 
7809 void __kmp_internal_begin(void) {
7810   int gtid;
7811   kmp_root_t *root;
7812 
7813   /* this is a very important step as it will register new sibling threads
7814      and assign these new uber threads a new gtid */
7815   gtid = __kmp_entry_gtid();
7816   root = __kmp_threads[gtid]->th.th_root;
7817   KMP_ASSERT(KMP_UBER_GTID(gtid));
7818 
7819   if (root->r.r_begin)
7820     return;
7821   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7822   if (root->r.r_begin) {
7823     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7824     return;
7825   }
7826 
7827   root->r.r_begin = TRUE;
7828 
7829   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7830 }
7831 
7832 /* ------------------------------------------------------------------------ */
7833 
7834 void __kmp_user_set_library(enum library_type arg) {
7835   int gtid;
7836   kmp_root_t *root;
7837   kmp_info_t *thread;
7838 
7839   /* first, make sure we are initialized so we can get our gtid */
7840 
7841   gtid = __kmp_entry_gtid();
7842   thread = __kmp_threads[gtid];
7843 
7844   root = thread->th.th_root;
7845 
7846   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7847                 library_serial));
7848   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7849                                   thread */
7850     KMP_WARNING(SetLibraryIncorrectCall);
7851     return;
7852   }
7853 
7854   switch (arg) {
7855   case library_serial:
7856     thread->th.th_set_nproc = 0;
7857     set__nproc(thread, 1);
7858     break;
7859   case library_turnaround:
7860     thread->th.th_set_nproc = 0;
7861     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7862                                            : __kmp_dflt_team_nth_ub);
7863     break;
7864   case library_throughput:
7865     thread->th.th_set_nproc = 0;
7866     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7867                                            : __kmp_dflt_team_nth_ub);
7868     break;
7869   default:
7870     KMP_FATAL(UnknownLibraryType, arg);
7871   }
7872 
7873   __kmp_aux_set_library(arg);
7874 }
7875 
7876 void __kmp_aux_set_stacksize(size_t arg) {
7877   if (!__kmp_init_serial)
7878     __kmp_serial_initialize();
7879 
7880 #if KMP_OS_DARWIN
7881   if (arg & (0x1000 - 1)) {
7882     arg &= ~(0x1000 - 1);
7883     if (arg + 0x1000) /* check for overflow if we round up */
7884       arg += 0x1000;
7885   }
7886 #endif
7887   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7888 
7889   /* only change the default stacksize before the first parallel region */
7890   if (!TCR_4(__kmp_init_parallel)) {
7891     size_t value = arg; /* argument is in bytes */
7892 
7893     if (value < __kmp_sys_min_stksize)
7894       value = __kmp_sys_min_stksize;
7895     else if (value > KMP_MAX_STKSIZE)
7896       value = KMP_MAX_STKSIZE;
7897 
7898     __kmp_stksize = value;
7899 
7900     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7901   }
7902 
7903   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7904 }
7905 
7906 /* set the behaviour of the runtime library */
7907 /* TODO this can cause some odd behaviour with sibling parallelism... */
7908 void __kmp_aux_set_library(enum library_type arg) {
7909   __kmp_library = arg;
7910 
7911   switch (__kmp_library) {
7912   case library_serial: {
7913     KMP_INFORM(LibraryIsSerial);
7914   } break;
7915   case library_turnaround:
7916     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7917       __kmp_use_yield = 2; // only yield when oversubscribed
7918     break;
7919   case library_throughput:
7920     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7921       __kmp_dflt_blocktime = 200;
7922     break;
7923   default:
7924     KMP_FATAL(UnknownLibraryType, arg);
7925   }
7926 }
7927 
7928 /* Getting team information common for all team API */
7929 // Returns NULL if not in teams construct
7930 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7931   kmp_info_t *thr = __kmp_entry_thread();
7932   teams_serialized = 0;
7933   if (thr->th.th_teams_microtask) {
7934     kmp_team_t *team = thr->th.th_team;
7935     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7936     int ii = team->t.t_level;
7937     teams_serialized = team->t.t_serialized;
7938     int level = tlevel + 1;
7939     KMP_DEBUG_ASSERT(ii >= tlevel);
7940     while (ii > level) {
7941       for (teams_serialized = team->t.t_serialized;
7942            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7943       }
7944       if (team->t.t_serialized && (!teams_serialized)) {
7945         team = team->t.t_parent;
7946         continue;
7947       }
7948       if (ii > level) {
7949         team = team->t.t_parent;
7950         ii--;
7951       }
7952     }
7953     return team;
7954   }
7955   return NULL;
7956 }
7957 
7958 int __kmp_aux_get_team_num() {
7959   int serialized;
7960   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7961   if (team) {
7962     if (serialized > 1) {
7963       return 0; // teams region is serialized ( 1 team of 1 thread ).
7964     } else {
7965       return team->t.t_master_tid;
7966     }
7967   }
7968   return 0;
7969 }
7970 
7971 int __kmp_aux_get_num_teams() {
7972   int serialized;
7973   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7974   if (team) {
7975     if (serialized > 1) {
7976       return 1;
7977     } else {
7978       return team->t.t_parent->t.t_nproc;
7979     }
7980   }
7981   return 1;
7982 }
7983 
7984 /* ------------------------------------------------------------------------ */
7985 
7986 /*
7987  * Affinity Format Parser
7988  *
7989  * Field is in form of: %[[[0].]size]type
7990  * % and type are required (%% means print a literal '%')
7991  * type is either single char or long name surrounded by {},
7992  * e.g., N or {num_threads}
7993  * 0 => leading zeros
7994  * . => right justified when size is specified
7995  * by default output is left justified
7996  * size is the *minimum* field length
7997  * All other characters are printed as is
7998  *
7999  * Available field types:
8000  * L {thread_level}      - omp_get_level()
8001  * n {thread_num}        - omp_get_thread_num()
8002  * h {host}              - name of host machine
8003  * P {process_id}        - process id (integer)
8004  * T {thread_identifier} - native thread identifier (integer)
8005  * N {num_threads}       - omp_get_num_threads()
8006  * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8007  * a {thread_affinity}   - comma separated list of integers or integer ranges
8008  *                         (values of affinity mask)
8009  *
8010  * Implementation-specific field types can be added
8011  * If a type is unknown, print "undefined"
8012 */
8013 
8014 // Structure holding the short name, long name, and corresponding data type
8015 // for snprintf.  A table of these will represent the entire valid keyword
8016 // field types.
8017 typedef struct kmp_affinity_format_field_t {
8018   char short_name; // from spec e.g., L -> thread level
8019   const char *long_name; // from spec thread_level -> thread level
8020   char field_format; // data type for snprintf (typically 'd' or 's'
8021   // for integer or string)
8022 } kmp_affinity_format_field_t;
8023 
8024 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8025 #if KMP_AFFINITY_SUPPORTED
8026     {'A', "thread_affinity", 's'},
8027 #endif
8028     {'t', "team_num", 'd'},
8029     {'T', "num_teams", 'd'},
8030     {'L', "nesting_level", 'd'},
8031     {'n', "thread_num", 'd'},
8032     {'N', "num_threads", 'd'},
8033     {'a', "ancestor_tnum", 'd'},
8034     {'H', "host", 's'},
8035     {'P', "process_id", 'd'},
8036     {'i', "native_thread_id", 'd'}};
8037 
8038 // Return the number of characters it takes to hold field
8039 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8040                                             const char **ptr,
8041                                             kmp_str_buf_t *field_buffer) {
8042   int rc, format_index, field_value;
8043   const char *width_left, *width_right;
8044   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8045   static const int FORMAT_SIZE = 20;
8046   char format[FORMAT_SIZE] = {0};
8047   char absolute_short_name = 0;
8048 
8049   KMP_DEBUG_ASSERT(gtid >= 0);
8050   KMP_DEBUG_ASSERT(th);
8051   KMP_DEBUG_ASSERT(**ptr == '%');
8052   KMP_DEBUG_ASSERT(field_buffer);
8053 
8054   __kmp_str_buf_clear(field_buffer);
8055 
8056   // Skip the initial %
8057   (*ptr)++;
8058 
8059   // Check for %% first
8060   if (**ptr == '%') {
8061     __kmp_str_buf_cat(field_buffer, "%", 1);
8062     (*ptr)++; // skip over the second %
8063     return 1;
8064   }
8065 
8066   // Parse field modifiers if they are present
8067   pad_zeros = false;
8068   if (**ptr == '0') {
8069     pad_zeros = true;
8070     (*ptr)++; // skip over 0
8071   }
8072   right_justify = false;
8073   if (**ptr == '.') {
8074     right_justify = true;
8075     (*ptr)++; // skip over .
8076   }
8077   // Parse width of field: [width_left, width_right)
8078   width_left = width_right = NULL;
8079   if (**ptr >= '0' && **ptr <= '9') {
8080     width_left = *ptr;
8081     SKIP_DIGITS(*ptr);
8082     width_right = *ptr;
8083   }
8084 
8085   // Create the format for KMP_SNPRINTF based on flags parsed above
8086   format_index = 0;
8087   format[format_index++] = '%';
8088   if (!right_justify)
8089     format[format_index++] = '-';
8090   if (pad_zeros)
8091     format[format_index++] = '0';
8092   if (width_left && width_right) {
8093     int i = 0;
8094     // Only allow 8 digit number widths.
8095     // This also prevents overflowing format variable
8096     while (i < 8 && width_left < width_right) {
8097       format[format_index++] = *width_left;
8098       width_left++;
8099       i++;
8100     }
8101   }
8102 
8103   // Parse a name (long or short)
8104   // Canonicalize the name into absolute_short_name
8105   found_valid_name = false;
8106   parse_long_name = (**ptr == '{');
8107   if (parse_long_name)
8108     (*ptr)++; // skip initial left brace
8109   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8110                              sizeof(__kmp_affinity_format_table[0]);
8111        ++i) {
8112     char short_name = __kmp_affinity_format_table[i].short_name;
8113     const char *long_name = __kmp_affinity_format_table[i].long_name;
8114     char field_format = __kmp_affinity_format_table[i].field_format;
8115     if (parse_long_name) {
8116       size_t length = KMP_STRLEN(long_name);
8117       if (strncmp(*ptr, long_name, length) == 0) {
8118         found_valid_name = true;
8119         (*ptr) += length; // skip the long name
8120       }
8121     } else if (**ptr == short_name) {
8122       found_valid_name = true;
8123       (*ptr)++; // skip the short name
8124     }
8125     if (found_valid_name) {
8126       format[format_index++] = field_format;
8127       format[format_index++] = '\0';
8128       absolute_short_name = short_name;
8129       break;
8130     }
8131   }
8132   if (parse_long_name) {
8133     if (**ptr != '}') {
8134       absolute_short_name = 0;
8135     } else {
8136       (*ptr)++; // skip over the right brace
8137     }
8138   }
8139 
8140   // Attempt to fill the buffer with the requested
8141   // value using snprintf within __kmp_str_buf_print()
8142   switch (absolute_short_name) {
8143   case 't':
8144     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8145     break;
8146   case 'T':
8147     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8148     break;
8149   case 'L':
8150     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8151     break;
8152   case 'n':
8153     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8154     break;
8155   case 'H': {
8156     static const int BUFFER_SIZE = 256;
8157     char buf[BUFFER_SIZE];
8158     __kmp_expand_host_name(buf, BUFFER_SIZE);
8159     rc = __kmp_str_buf_print(field_buffer, format, buf);
8160   } break;
8161   case 'P':
8162     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8163     break;
8164   case 'i':
8165     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8166     break;
8167   case 'N':
8168     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8169     break;
8170   case 'a':
8171     field_value =
8172         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8173     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8174     break;
8175 #if KMP_AFFINITY_SUPPORTED
8176   case 'A': {
8177     kmp_str_buf_t buf;
8178     __kmp_str_buf_init(&buf);
8179     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8180     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8181     __kmp_str_buf_free(&buf);
8182   } break;
8183 #endif
8184   default:
8185     // According to spec, If an implementation does not have info for field
8186     // type, then "undefined" is printed
8187     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8188     // Skip the field
8189     if (parse_long_name) {
8190       SKIP_TOKEN(*ptr);
8191       if (**ptr == '}')
8192         (*ptr)++;
8193     } else {
8194       (*ptr)++;
8195     }
8196   }
8197 
8198   KMP_ASSERT(format_index <= FORMAT_SIZE);
8199   return rc;
8200 }
8201 
8202 /*
8203  * Return number of characters needed to hold the affinity string
8204  * (not including null byte character)
8205  * The resultant string is printed to buffer, which the caller can then
8206  * handle afterwards
8207 */
8208 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8209                                   kmp_str_buf_t *buffer) {
8210   const char *parse_ptr;
8211   size_t retval;
8212   const kmp_info_t *th;
8213   kmp_str_buf_t field;
8214 
8215   KMP_DEBUG_ASSERT(buffer);
8216   KMP_DEBUG_ASSERT(gtid >= 0);
8217 
8218   __kmp_str_buf_init(&field);
8219   __kmp_str_buf_clear(buffer);
8220 
8221   th = __kmp_threads[gtid];
8222   retval = 0;
8223 
8224   // If format is NULL or zero-length string, then we use
8225   // affinity-format-var ICV
8226   parse_ptr = format;
8227   if (parse_ptr == NULL || *parse_ptr == '\0') {
8228     parse_ptr = __kmp_affinity_format;
8229   }
8230   KMP_DEBUG_ASSERT(parse_ptr);
8231 
8232   while (*parse_ptr != '\0') {
8233     // Parse a field
8234     if (*parse_ptr == '%') {
8235       // Put field in the buffer
8236       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8237       __kmp_str_buf_catbuf(buffer, &field);
8238       retval += rc;
8239     } else {
8240       // Put literal character in buffer
8241       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8242       retval++;
8243       parse_ptr++;
8244     }
8245   }
8246   __kmp_str_buf_free(&field);
8247   return retval;
8248 }
8249 
8250 // Displays the affinity string to stdout
8251 void __kmp_aux_display_affinity(int gtid, const char *format) {
8252   kmp_str_buf_t buf;
8253   __kmp_str_buf_init(&buf);
8254   __kmp_aux_capture_affinity(gtid, format, &buf);
8255   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8256   __kmp_str_buf_free(&buf);
8257 }
8258 
8259 /* ------------------------------------------------------------------------ */
8260 
8261 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8262   int blocktime = arg; /* argument is in milliseconds */
8263 #if KMP_USE_MONITOR
8264   int bt_intervals;
8265 #endif
8266   kmp_int8 bt_set;
8267 
8268   __kmp_save_internal_controls(thread);
8269 
8270   /* Normalize and set blocktime for the teams */
8271   if (blocktime < KMP_MIN_BLOCKTIME)
8272     blocktime = KMP_MIN_BLOCKTIME;
8273   else if (blocktime > KMP_MAX_BLOCKTIME)
8274     blocktime = KMP_MAX_BLOCKTIME;
8275 
8276   set__blocktime_team(thread->th.th_team, tid, blocktime);
8277   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8278 
8279 #if KMP_USE_MONITOR
8280   /* Calculate and set blocktime intervals for the teams */
8281   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8282 
8283   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8284   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8285 #endif
8286 
8287   /* Set whether blocktime has been set to "TRUE" */
8288   bt_set = TRUE;
8289 
8290   set__bt_set_team(thread->th.th_team, tid, bt_set);
8291   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8292 #if KMP_USE_MONITOR
8293   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8294                 "bt_intervals=%d, monitor_updates=%d\n",
8295                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8296                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8297                 __kmp_monitor_wakeups));
8298 #else
8299   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8300                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8301                 thread->th.th_team->t.t_id, tid, blocktime));
8302 #endif
8303 }
8304 
8305 void __kmp_aux_set_defaults(char const *str, size_t len) {
8306   if (!__kmp_init_serial) {
8307     __kmp_serial_initialize();
8308   }
8309   __kmp_env_initialize(str);
8310 
8311   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8312     __kmp_env_print();
8313   }
8314 } // __kmp_aux_set_defaults
8315 
8316 /* ------------------------------------------------------------------------ */
8317 /* internal fast reduction routines */
8318 
8319 PACKED_REDUCTION_METHOD_T
8320 __kmp_determine_reduction_method(
8321     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8322     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8323     kmp_critical_name *lck) {
8324 
8325   // Default reduction method: critical construct ( lck != NULL, like in current
8326   // PAROPT )
8327   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8328   // can be selected by RTL
8329   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8330   // can be selected by RTL
8331   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8332   // among generated by PAROPT.
8333 
8334   PACKED_REDUCTION_METHOD_T retval;
8335 
8336   int team_size;
8337 
8338   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8339   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8340 
8341 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8342   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8343 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8344 
8345   retval = critical_reduce_block;
8346 
8347   // another choice of getting a team size (with 1 dynamic deference) is slower
8348   team_size = __kmp_get_team_num_threads(global_tid);
8349   if (team_size == 1) {
8350 
8351     retval = empty_reduce_block;
8352 
8353   } else {
8354 
8355     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8356 
8357 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8358     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8359 
8360 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8361     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8362 
8363     int teamsize_cutoff = 4;
8364 
8365 #if KMP_MIC_SUPPORTED
8366     if (__kmp_mic_type != non_mic) {
8367       teamsize_cutoff = 8;
8368     }
8369 #endif
8370     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8371     if (tree_available) {
8372       if (team_size <= teamsize_cutoff) {
8373         if (atomic_available) {
8374           retval = atomic_reduce_block;
8375         }
8376       } else {
8377         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8378       }
8379     } else if (atomic_available) {
8380       retval = atomic_reduce_block;
8381     }
8382 #else
8383 #error "Unknown or unsupported OS"
8384 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8385        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8386 
8387 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8388 
8389 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8390 
8391     // basic tuning
8392 
8393     if (atomic_available) {
8394       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8395         retval = atomic_reduce_block;
8396       }
8397     } // otherwise: use critical section
8398 
8399 #elif KMP_OS_DARWIN
8400 
8401     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8402     if (atomic_available && (num_vars <= 3)) {
8403       retval = atomic_reduce_block;
8404     } else if (tree_available) {
8405       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8406           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8407         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8408       }
8409     } // otherwise: use critical section
8410 
8411 #else
8412 #error "Unknown or unsupported OS"
8413 #endif
8414 
8415 #else
8416 #error "Unknown or unsupported architecture"
8417 #endif
8418   }
8419 
8420   // KMP_FORCE_REDUCTION
8421 
8422   // If the team is serialized (team_size == 1), ignore the forced reduction
8423   // method and stay with the unsynchronized method (empty_reduce_block)
8424   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8425       team_size != 1) {
8426 
8427     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8428 
8429     int atomic_available, tree_available;
8430 
8431     switch ((forced_retval = __kmp_force_reduction_method)) {
8432     case critical_reduce_block:
8433       KMP_ASSERT(lck); // lck should be != 0
8434       break;
8435 
8436     case atomic_reduce_block:
8437       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8438       if (!atomic_available) {
8439         KMP_WARNING(RedMethodNotSupported, "atomic");
8440         forced_retval = critical_reduce_block;
8441       }
8442       break;
8443 
8444     case tree_reduce_block:
8445       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8446       if (!tree_available) {
8447         KMP_WARNING(RedMethodNotSupported, "tree");
8448         forced_retval = critical_reduce_block;
8449       } else {
8450 #if KMP_FAST_REDUCTION_BARRIER
8451         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8452 #endif
8453       }
8454       break;
8455 
8456     default:
8457       KMP_ASSERT(0); // "unsupported method specified"
8458     }
8459 
8460     retval = forced_retval;
8461   }
8462 
8463   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8464 
8465 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8466 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8467 
8468   return (retval);
8469 }
8470 // this function is for testing set/get/determine reduce method
8471 kmp_int32 __kmp_get_reduce_method(void) {
8472   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8473 }
8474 
8475 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8476 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8477 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8478 
8479 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8480 // OpenMP is used subsequently.
8481 void __kmp_hard_pause() {
8482   __kmp_pause_status = kmp_hard_paused;
8483   __kmp_internal_end_thread(-1);
8484 }
8485 
8486 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8487 void __kmp_resume_if_soft_paused() {
8488   if (__kmp_pause_status == kmp_soft_paused) {
8489     __kmp_pause_status = kmp_not_paused;
8490 
8491     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8492       kmp_info_t *thread = __kmp_threads[gtid];
8493       if (thread) { // Wake it if sleeping
8494         kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8495                          thread);
8496         if (fl.is_sleeping())
8497           fl.resume(gtid);
8498         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8499           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8500         } else { // thread holds the lock and may sleep soon
8501           do { // until either the thread sleeps, or we can get the lock
8502             if (fl.is_sleeping()) {
8503               fl.resume(gtid);
8504               break;
8505             } else if (__kmp_try_suspend_mx(thread)) {
8506               __kmp_unlock_suspend_mx(thread);
8507               break;
8508             }
8509           } while (1);
8510         }
8511       }
8512     }
8513   }
8514 }
8515 
8516 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8517 // TODO: add warning messages
8518 int __kmp_pause_resource(kmp_pause_status_t level) {
8519   if (level == kmp_not_paused) { // requesting resume
8520     if (__kmp_pause_status == kmp_not_paused) {
8521       // error message about runtime not being paused, so can't resume
8522       return 1;
8523     } else {
8524       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8525                        __kmp_pause_status == kmp_hard_paused);
8526       __kmp_pause_status = kmp_not_paused;
8527       return 0;
8528     }
8529   } else if (level == kmp_soft_paused) { // requesting soft pause
8530     if (__kmp_pause_status != kmp_not_paused) {
8531       // error message about already being paused
8532       return 1;
8533     } else {
8534       __kmp_soft_pause();
8535       return 0;
8536     }
8537   } else if (level == kmp_hard_paused) { // requesting hard pause
8538     if (__kmp_pause_status != kmp_not_paused) {
8539       // error message about already being paused
8540       return 1;
8541     } else {
8542       __kmp_hard_pause();
8543       return 0;
8544     }
8545   } else {
8546     // error message about invalid level
8547     return 1;
8548   }
8549 }
8550 
8551 void __kmp_omp_display_env(int verbose) {
8552   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8553   if (__kmp_init_serial == 0)
8554     __kmp_do_serial_initialize();
8555   __kmp_display_env_impl(!verbose, verbose);
8556   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8557 }
8558 
8559 // Globals and functions for hidden helper task
8560 kmp_info_t **__kmp_hidden_helper_threads;
8561 kmp_info_t *__kmp_hidden_helper_main_thread;
8562 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8563 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8564 #if KMP_OS_LINUX
8565 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8566 #else
8567 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8568 #endif
8569 
8570 namespace {
8571 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8572 
8573 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8574   // This is an explicit synchronization on all hidden helper threads in case
8575   // that when a regular thread pushes a hidden helper task to one hidden
8576   // helper thread, the thread has not been awaken once since they're released
8577   // by the main thread after creating the team.
8578   KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8579   while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8580          __kmp_hidden_helper_threads_num)
8581     ;
8582 
8583   // If main thread, then wait for signal
8584   if (__kmpc_master(nullptr, *gtid)) {
8585     // First, unset the initial state and release the initial thread
8586     TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8587     __kmp_hidden_helper_initz_release();
8588     __kmp_hidden_helper_main_thread_wait();
8589     // Now wake up all worker threads
8590     for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8591       __kmp_hidden_helper_worker_thread_signal();
8592     }
8593   }
8594 }
8595 } // namespace
8596 
8597 void __kmp_hidden_helper_threads_initz_routine() {
8598   // Create a new root for hidden helper team/threads
8599   const int gtid = __kmp_register_root(TRUE);
8600   __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8601   __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8602   __kmp_hidden_helper_main_thread->th.th_set_nproc =
8603       __kmp_hidden_helper_threads_num;
8604 
8605   KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8606 
8607   __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8608 
8609   // Set the initialization flag to FALSE
8610   TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8611 
8612   __kmp_hidden_helper_threads_deinitz_release();
8613 }
8614