xref: /freebsd-src/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * initialization will happen implicitly when other library functions are used.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is a no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shut down the runtime library. This call is also optional, and even if it is
56  * made it will not do anything unless the `KMP_IGNORE_MPPEND` environment
57  * variable is set to zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE, which makes the
61   // __kmpc_end() call a no-op. However, this can be overridden with the
62   // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
63   // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
64   // root (which can cause library shutdown).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
88 If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
116 reflects the total over all such calls. Similarly, since the runtime maintains
117 underlying threads even when they are not active (because the cost of creating
118 and destroying OS threads is high), this call counts all such threads even if
119 they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
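
For illustration only (a sketch of a typical lowering, not the exact code any
particular compiler emits; `loc` and `gtid` stand for the compiler-supplied
source location and global thread number), a construct such as
@code
#pragma omp parallel num_threads(4)
{ /* body */ }
@endcode
is preceded by a call like
@code
__kmpc_push_num_threads(&loc, gtid, 4);
// ... then __kmpc_fork_call() launches the outlined body
@endcode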
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
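
For illustration only (hypothetical names; the exact generated code is compiler
specific and not prescribed here), a region such as
@code
int a;
#pragma omp parallel shared(a)
{ use(a); }
@endcode
might be outlined into a routine with the `kmpc_micro` signature and launched as
@code
void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *a) { use(*a); }
// ...
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &a);
@endcode
where each pointer to a non-global shared variable is passed through the ellipsis.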
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
282   // maybe saving thr_state would be enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       ompt_frame = &master_th->th.th_current_task->ompt_task_info.frame;
292       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
293     }
294     OMPT_STORE_RETURN_ADDRESS(gtid);
295 #endif
296 
297 #if INCLUDE_SSC_MARKS
298     SSC_MARK_FORKING();
299 #endif
300     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
301                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
302                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
303                     kmp_va_addr_of(ap));
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_JOINING();
306 #endif
307     __kmp_join_call(loc, gtid
308 #if OMPT_SUPPORT
309                     ,
310                     fork_context_intel
311 #endif
312     );
313 
314     va_end(ap);
315 
316 #if OMPT_SUPPORT
317     if (ompt_enabled.enabled) {
318       ompt_frame->enter_frame = ompt_data_none;
319     }
320 #endif
321   }
322 
323 #if KMP_STATS_ENABLED
324   if (previous_state == stats_state_e::SERIAL_REGION) {
325     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
326     KMP_SET_THREAD_STATE(previous_state);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc  source location information
336 @param microtask  pointer to callback routine consisting of outlined parallel
337 construct
338 @param cond  condition for running in parallel
339 @param args  struct of pointers to shared variables that aren't global
340 
341 Perform a fork only if the condition is true.
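
For illustration only (hypothetical names, not a prescribed lowering): with an
outlined routine `outlined` that takes a single pointer to an aggregate of the
shared variables, a conditional region could be launched as
@code
__kmpc_fork_call_if(&loc, 1, (kmpc_micro)outlined, cond_value, &shared_agg);
@endcode
which forks when `cond_value` is non-zero and runs the body serialized otherwise.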
342 */
343 void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
344                          kmp_int32 cond, void *args) {
345   int gtid = __kmp_entry_gtid();
346   if (cond) {
347     if (args)
348       __kmpc_fork_call(loc, argc, microtask, args);
349     else
350       __kmpc_fork_call(loc, argc, microtask);
351   } else {
352     __kmpc_serialized_parallel(loc, gtid);
353 
354 #if OMPT_SUPPORT
355     void *exit_frame_ptr;
356 #endif
357 
358     if (args)
359       __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
360                              /*npr=*/0,
361                              /*argc=*/1, &args
362 #if OMPT_SUPPORT
363                              ,
364                              &exit_frame_ptr
365 #endif
366       );
367     else
368       __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
369                              /*npr=*/0,
370                              /*argc=*/0,
371                              /*args=*/nullptr
372 #if OMPT_SUPPORT
373                              ,
374                              &exit_frame_ptr
375 #endif
376       );
377 
378     __kmpc_end_serialized_parallel(loc, gtid);
379   }
380 }
381 
382 /*!
383 @ingroup PARALLEL
384 @param loc source location information
385 @param global_tid global thread number
386 @param num_teams number of teams requested for the teams construct
387 @param num_threads number of threads per team requested for the teams construct
388 
389 Set the number of teams to be used by the teams construct.
390 This call is only required if the teams construct has a `num_teams` clause
391 or a `thread_limit` clause (or both).
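
For illustration only (a sketch of a typical lowering; `loc` and `gtid` are the
compiler-supplied location and global thread number), a construct such as
@code
#pragma omp teams num_teams(4) thread_limit(8)
{ /* body */ }
@endcode
is typically preceded by
@code
__kmpc_push_num_teams(&loc, gtid, 4, 8);
@endcode
before the runtime call that forks the teams.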
392 */
393 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
394                            kmp_int32 num_teams, kmp_int32 num_threads) {
395   KA_TRACE(20,
396            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
397             global_tid, num_teams, num_threads));
398   __kmp_assert_valid_gtid(global_tid);
399   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
400 }
401 
402 /*!
403 @ingroup PARALLEL
404 @param loc source location information
405 @param global_tid global thread number
406 @param thread_limit limit on number of threads which can be created within the
407 current task
408 
409 Set the thread_limit for the current task
410 This call is there to support `thread_limit` clause on the `target` construct
411 */
412 void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
413                              kmp_int32 thread_limit) {
414   __kmp_assert_valid_gtid(global_tid);
415   kmp_info_t *thread = __kmp_threads[global_tid];
416   if (thread_limit > 0)
417     thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
418 }
419 
420 /*!
421 @ingroup PARALLEL
422 @param loc source location information
423 @param global_tid global thread number
424 @param num_teams_lb lower bound on number of teams requested for the teams
425 construct
426 @param num_teams_ub upper bound on number of teams requested for the teams
427 construct
428 @param num_threads number of threads per team requested for the teams construct
429 
430 Set the number of teams to be used by the teams construct. The number of initial
431 teams created will be greater than or equal to the lower bound and less than or
432 equal to the upper bound.
433 This call is only required if the teams construct has a `num_teams` clause
434 or a `thread_limit` clause (or both).
435 */
436 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
437                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
438                               kmp_int32 num_threads) {
439   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
440                 " num_teams_ub=%d num_threads=%d\n",
441                 global_tid, num_teams_lb, num_teams_ub, num_threads));
442   __kmp_assert_valid_gtid(global_tid);
443   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
444                           num_threads);
445 }
446 
447 /*!
448 @ingroup PARALLEL
449 @param loc  source location information
450 @param argc  total number of arguments in the ellipsis
451 @param microtask  pointer to callback routine consisting of outlined teams
452 construct
453 @param ...  pointers to shared variables that aren't global
454 
455 Do the actual fork and call the microtask in the relevant number of threads.
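
For illustration only (hypothetical names; the exact lowering is compiler
specific), a
@code
#pragma omp teams
@endcode
region outlined into `outlined_teams` with the `kmpc_micro` signature might be
launched as
@code
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams);
@endcode
possibly preceded by a __kmpc_push_num_teams() call when `num_teams` or
`thread_limit` clauses are present (see above).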
456 */
457 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
458                        ...) {
459   int gtid = __kmp_entry_gtid();
460   kmp_info_t *this_thr = __kmp_threads[gtid];
461   va_list ap;
462   va_start(ap, microtask);
463 
464 #if KMP_STATS_ENABLED
465   KMP_COUNT_BLOCK(OMP_TEAMS);
466   stats_state_e previous_state = KMP_GET_THREAD_STATE();
467   if (previous_state == stats_state_e::SERIAL_REGION) {
468     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
469   } else {
470     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
471   }
472 #endif
473 
474   // remember teams entry point and nesting level
475   this_thr->th.th_teams_microtask = microtask;
476   this_thr->th.th_teams_level =
477       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
478 
479 #if OMPT_SUPPORT
480   kmp_team_t *parent_team = this_thr->th.th_team;
481   int tid = __kmp_tid_from_gtid(gtid);
482   if (ompt_enabled.enabled) {
483     parent_team->t.t_implicit_task_taskdata[tid]
484         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
485   }
486   OMPT_STORE_RETURN_ADDRESS(gtid);
487 #endif
488 
489   // Check if __kmpc_push_num_teams was called; set the default number of teams
490   // otherwise.
491   if (this_thr->th.th_teams_size.nteams == 0) {
492     __kmp_push_num_teams(loc, gtid, 0, 0);
493   }
494   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
495   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
496   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
497 
498   __kmp_fork_call(
499       loc, gtid, fork_context_intel, argc,
500       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
501       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
502   __kmp_join_call(loc, gtid
503 #if OMPT_SUPPORT
504                   ,
505                   fork_context_intel
506 #endif
507   );
508 
509   // Pop current CG root off list
510   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
511   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
512   this_thr->th.th_cg_roots = tmp->up;
513   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
514                  " to node %p. cg_nthreads was %d\n",
515                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
516   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
517   int i = tmp->cg_nthreads--;
518   if (i == 1) { // check if we are the last thread in CG (not always the case)
519     __kmp_free(tmp);
520   }
521   // Restore current task's thread_limit from CG root
522   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
523   this_thr->th.th_current_task->td_icvs.thread_limit =
524       this_thr->th.th_cg_roots->cg_thread_limit;
525 
526   this_thr->th.th_teams_microtask = NULL;
527   this_thr->th.th_teams_level = 0;
528   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
529   va_end(ap);
530 #if KMP_STATS_ENABLED
531   if (previous_state == stats_state_e::SERIAL_REGION) {
532     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
533     KMP_SET_THREAD_STATE(previous_state);
534   } else {
535     KMP_POP_PARTITIONED_TIMER();
536   }
537 #endif // KMP_STATS_ENABLED
538 }
539 
540 // I don't think this function should ever have been exported.
541 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
542 // openmp code ever called it, but it's been exported from the RTL for so
543 // long that I'm afraid to remove the definition.
544 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
545 
546 /*!
547 @ingroup PARALLEL
548 @param loc  source location information
549 @param global_tid  global thread number
550 
551 Enter a serialized parallel construct. This interface is used to handle a
552 conditional parallel region, like this,
553 @code
554 #pragma omp parallel if (condition)
555 @endcode
556 when the condition is false.
557 */
558 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
559   // The implementation is now in kmp_runtime.cpp so that it can share static
560   // functions with kmp_fork_call since the tasks to be done are similar in
561   // each case.
562   __kmp_assert_valid_gtid(global_tid);
563 #if OMPT_SUPPORT
564   OMPT_STORE_RETURN_ADDRESS(global_tid);
565 #endif
566   __kmp_serialized_parallel(loc, global_tid);
567 }
568 
569 /*!
570 @ingroup PARALLEL
571 @param loc  source location information
572 @param global_tid  global thread number
573 
574 Leave a serialized parallel construct.
575 */
576 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
577   kmp_internal_control_t *top;
578   kmp_info_t *this_thr;
579   kmp_team_t *serial_team;
580 
581   KC_TRACE(10,
582            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
583 
584   /* skip all this code for autopar serialized loops since it results in
585      unacceptable overhead */
586   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
587     return;
588 
589   // Not autopar code
590   __kmp_assert_valid_gtid(global_tid);
591   if (!TCR_4(__kmp_init_parallel))
592     __kmp_parallel_initialize();
593 
594   __kmp_resume_if_soft_paused();
595 
596   this_thr = __kmp_threads[global_tid];
597   serial_team = this_thr->th.th_serial_team;
598 
599   kmp_task_team_t *task_team = this_thr->th.th_task_team;
600   // we need to wait for the proxy tasks before finishing the thread
601   if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
602                             task_team->tt.tt_hidden_helper_task_encountered))
603     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
604 
605   KMP_MB();
606   KMP_DEBUG_ASSERT(serial_team);
607   KMP_ASSERT(serial_team->t.t_serialized);
608   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
609   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
610   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
611   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
612 
613 #if OMPT_SUPPORT
614   if (ompt_enabled.enabled &&
615       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
616     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
617     if (ompt_enabled.ompt_callback_implicit_task) {
618       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
619           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
620           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
621     }
622 
623     // reset/clear the task id only after unlinking the task
624     ompt_data_t *parent_task_data;
625     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
626 
627     if (ompt_enabled.ompt_callback_parallel_end) {
628       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
629           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
630           ompt_parallel_invoker_program | ompt_parallel_team,
631           OMPT_LOAD_RETURN_ADDRESS(global_tid));
632     }
633     __ompt_lw_taskteam_unlink(this_thr);
634     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
635   }
636 #endif
637 
638   /* If necessary, pop the internal control stack values and replace the team
639    * values */
640   top = serial_team->t.t_control_stack_top;
641   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
642     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
643     serial_team->t.t_control_stack_top = top->next;
644     __kmp_free(top);
645   }
646 
647   /* pop dispatch buffers stack */
648   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
649   {
650     dispatch_private_info_t *disp_buffer =
651         serial_team->t.t_dispatch->th_disp_buffer;
652     serial_team->t.t_dispatch->th_disp_buffer =
653         serial_team->t.t_dispatch->th_disp_buffer->next;
654     __kmp_free(disp_buffer);
655   }
656   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
657 
658   --serial_team->t.t_serialized;
659   if (serial_team->t.t_serialized == 0) {
660 
661     /* return to the parallel section */
662 
663 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
664     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
665       __kmp_clear_x87_fpu_status_word();
666       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
667       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
668     }
669 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
670 
671     __kmp_pop_current_task_from_thread(this_thr);
672 #if OMPD_SUPPORT
673     if (ompd_state & OMPD_ENABLE_BP)
674       ompd_bp_parallel_end();
675 #endif
676 
677     this_thr->th.th_team = serial_team->t.t_parent;
678     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
679 
680     /* restore values cached in the thread */
681     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
682     this_thr->th.th_team_master =
683         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
684     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
685 
686     /* TODO the below shouldn't need to be adjusted for serialized teams */
687     this_thr->th.th_dispatch =
688         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
689 
690     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
691     this_thr->th.th_current_task->td_flags.executing = 1;
692 
693     if (__kmp_tasking_mode != tskm_immediate_exec) {
694       // Copy the task team from the new child / old parent team to the thread.
695       this_thr->th.th_task_team =
696           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
697       KA_TRACE(20,
698                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
699                 "team %p\n",
700                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
701     }
702 #if KMP_AFFINITY_SUPPORTED
703     if (this_thr->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
704       __kmp_reset_root_init_mask(global_tid);
705     }
706 #endif
707   } else {
708     if (__kmp_tasking_mode != tskm_immediate_exec) {
709       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
710                     "depth of serial team %p to %d\n",
711                     global_tid, serial_team, serial_team->t.t_serialized));
712     }
713   }
714 
715   serial_team->t.t_level--;
716   if (__kmp_env_consistency_check)
717     __kmp_pop_parallel(global_tid, NULL);
718 #if OMPT_SUPPORT
719   if (ompt_enabled.enabled)
720     this_thr->th.ompt_thread_info.state =
721         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
722                                            : ompt_state_work_parallel);
723 #endif
724 }
725 
726 /*!
727 @ingroup SYNCHRONIZATION
728 @param loc  source location information.
729 
730 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
731 depending on the memory ordering convention obeyed by the compiler
732 even that may not be necessary).
733 */
734 void __kmpc_flush(ident_t *loc) {
735   KC_TRACE(10, ("__kmpc_flush: called\n"));
736 
737   /* need explicit __mf() here since we use volatile instead in the library */
738   KMP_MFENCE(); /* Flush all pending memory write invalidates.  */
739 
740 #if OMPT_SUPPORT && OMPT_OPTIONAL
741   if (ompt_enabled.ompt_callback_flush) {
742     ompt_callbacks.ompt_callback(ompt_callback_flush)(
743         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
744   }
745 #endif
746 }
747 
748 /* -------------------------------------------------------------------------- */
749 /*!
750 @ingroup SYNCHRONIZATION
751 @param loc source location information
752 @param global_tid thread id.
753 
754 Execute a barrier.
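
For example (sketch only; `loc` and `gtid` are the compiler-supplied source
location and global thread number), an explicit
@code
#pragma omp barrier
@endcode
is lowered to a call of the form
@code
__kmpc_barrier(&loc, gtid);
@endcode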
755 */
756 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
757   KMP_COUNT_BLOCK(OMP_BARRIER);
758   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
759   __kmp_assert_valid_gtid(global_tid);
760 
761   if (!TCR_4(__kmp_init_parallel))
762     __kmp_parallel_initialize();
763 
764   __kmp_resume_if_soft_paused();
765 
766   if (__kmp_env_consistency_check) {
767     if (loc == 0) {
768       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
769     }
770     __kmp_check_barrier(global_tid, ct_barrier, loc);
771   }
772 
773 #if OMPT_SUPPORT
774   ompt_frame_t *ompt_frame;
775   if (ompt_enabled.enabled) {
776     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
777     if (ompt_frame->enter_frame.ptr == NULL)
778       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
779   }
780   OMPT_STORE_RETURN_ADDRESS(global_tid);
781 #endif
782   __kmp_threads[global_tid]->th.th_ident = loc;
783   // TODO: explicit barrier_wait_id:
784   //   this function is called when 'barrier' directive is present or
785   //   implicit barrier at the end of a worksharing construct.
786   // 1) better to add a per-thread barrier counter to a thread data structure
787   // 2) set to 0 when a new team is created
788   // 3) no sync is required
789 
790   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
791 #if OMPT_SUPPORT && OMPT_OPTIONAL
792   if (ompt_enabled.enabled) {
793     ompt_frame->enter_frame = ompt_data_none;
794   }
795 #endif
796 }
797 
798 /* The BARRIER for a MASTER section is always explicit   */
799 /*!
800 @ingroup WORK_SHARING
801 @param loc  source location information.
802 @param global_tid  global thread number.
803 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
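
Typical usage (a sketch with illustrative names) pairs this call with
__kmpc_end_master():
@code
if (__kmpc_master(&loc, gtid)) {
  /* master block */
  __kmpc_end_master(&loc, gtid);
}
@endcode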
804 */
805 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
806   int status = 0;
807 
808   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
809   __kmp_assert_valid_gtid(global_tid);
810 
811   if (!TCR_4(__kmp_init_parallel))
812     __kmp_parallel_initialize();
813 
814   __kmp_resume_if_soft_paused();
815 
816   if (KMP_MASTER_GTID(global_tid)) {
817     KMP_COUNT_BLOCK(OMP_MASTER);
818     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
819     status = 1;
820   }
821 
822 #if OMPT_SUPPORT && OMPT_OPTIONAL
823   if (status) {
824     if (ompt_enabled.ompt_callback_masked) {
825       kmp_info_t *this_thr = __kmp_threads[global_tid];
826       kmp_team_t *team = this_thr->th.th_team;
827 
828       int tid = __kmp_tid_from_gtid(global_tid);
829       ompt_callbacks.ompt_callback(ompt_callback_masked)(
830           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
831           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
832           OMPT_GET_RETURN_ADDRESS(0));
833     }
834   }
835 #endif
836 
837   if (__kmp_env_consistency_check) {
838 #if KMP_USE_DYNAMIC_LOCK
839     if (status)
840       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
841     else
842       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
843 #else
844     if (status)
845       __kmp_push_sync(global_tid, ct_master, loc, NULL);
846     else
847       __kmp_check_sync(global_tid, ct_master, loc, NULL);
848 #endif
849   }
850 
851   return status;
852 }
853 
854 /*!
855 @ingroup WORK_SHARING
856 @param loc  source location information.
857 @param global_tid  global thread number.
858 
859 Mark the end of a <tt>master</tt> region. This should only be called by the
860 thread that executes the <tt>master</tt> region.
861 */
862 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
863   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
864   __kmp_assert_valid_gtid(global_tid);
865   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
866   KMP_POP_PARTITIONED_TIMER();
867 
868 #if OMPT_SUPPORT && OMPT_OPTIONAL
869   kmp_info_t *this_thr = __kmp_threads[global_tid];
870   kmp_team_t *team = this_thr->th.th_team;
871   if (ompt_enabled.ompt_callback_masked) {
872     int tid = __kmp_tid_from_gtid(global_tid);
873     ompt_callbacks.ompt_callback(ompt_callback_masked)(
874         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
875         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
876         OMPT_GET_RETURN_ADDRESS(0));
877   }
878 #endif
879 
880   if (__kmp_env_consistency_check) {
881     if (KMP_MASTER_GTID(global_tid))
882       __kmp_pop_sync(global_tid, ct_master, loc);
883   }
884 }
885 
886 /*!
887 @ingroup WORK_SHARING
888 @param loc  source location information.
889 @param global_tid  global thread number.
890 @param filter result of evaluating filter clause on thread global_tid, or zero
891 if no filter clause is present
892 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
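
Typical usage (a sketch with illustrative names) pairs this call with
__kmpc_end_masked(), passing the evaluated filter expression:
@code
if (__kmpc_masked(&loc, gtid, filter_value)) {
  /* masked block */
  __kmpc_end_masked(&loc, gtid);
}
@endcode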
893 */
894 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
895   int status = 0;
896   int tid;
897   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
898   __kmp_assert_valid_gtid(global_tid);
899 
900   if (!TCR_4(__kmp_init_parallel))
901     __kmp_parallel_initialize();
902 
903   __kmp_resume_if_soft_paused();
904 
905   tid = __kmp_tid_from_gtid(global_tid);
906   if (tid == filter) {
907     KMP_COUNT_BLOCK(OMP_MASKED);
908     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
909     status = 1;
910   }
911 
912 #if OMPT_SUPPORT && OMPT_OPTIONAL
913   if (status) {
914     if (ompt_enabled.ompt_callback_masked) {
915       kmp_info_t *this_thr = __kmp_threads[global_tid];
916       kmp_team_t *team = this_thr->th.th_team;
917       ompt_callbacks.ompt_callback(ompt_callback_masked)(
918           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
919           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
920           OMPT_GET_RETURN_ADDRESS(0));
921     }
922   }
923 #endif
924 
925   if (__kmp_env_consistency_check) {
926 #if KMP_USE_DYNAMIC_LOCK
927     if (status)
928       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
929     else
930       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
931 #else
932     if (status)
933       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
934     else
935       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
936 #endif
937   }
938 
939   return status;
940 }
941 
942 /*!
943 @ingroup WORK_SHARING
944 @param loc  source location information.
945 @param global_tid  global thread number.
946 
947 Mark the end of a <tt>masked</tt> region. This should only be called by the
948 thread that executes the <tt>masked</tt> region.
949 */
950 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
951   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
952   __kmp_assert_valid_gtid(global_tid);
953   KMP_POP_PARTITIONED_TIMER();
954 
955 #if OMPT_SUPPORT && OMPT_OPTIONAL
956   kmp_info_t *this_thr = __kmp_threads[global_tid];
957   kmp_team_t *team = this_thr->th.th_team;
958   if (ompt_enabled.ompt_callback_masked) {
959     int tid = __kmp_tid_from_gtid(global_tid);
960     ompt_callbacks.ompt_callback(ompt_callback_masked)(
961         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
962         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
963         OMPT_GET_RETURN_ADDRESS(0));
964   }
965 #endif
966 
967   if (__kmp_env_consistency_check) {
968     __kmp_pop_sync(global_tid, ct_masked, loc);
969   }
970 }
971 
972 /*!
973 @ingroup WORK_SHARING
974 @param loc  source location information.
975 @param gtid  global thread number.
976 
977 Start execution of an <tt>ordered</tt> construct.
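
Typical usage inside a loop with an `ordered` clause (a sketch with illustrative
names) brackets the ordered block with this call and __kmpc_end_ordered():
@code
__kmpc_ordered(&loc, gtid);
/* ordered block */
__kmpc_end_ordered(&loc, gtid);
@endcode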
978 */
979 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
980   int cid = 0;
981   kmp_info_t *th;
982   KMP_DEBUG_ASSERT(__kmp_init_serial);
983 
984   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
985   __kmp_assert_valid_gtid(gtid);
986 
987   if (!TCR_4(__kmp_init_parallel))
988     __kmp_parallel_initialize();
989 
990   __kmp_resume_if_soft_paused();
991 
992 #if USE_ITT_BUILD
993   __kmp_itt_ordered_prep(gtid);
994 // TODO: ordered_wait_id
995 #endif /* USE_ITT_BUILD */
996 
997   th = __kmp_threads[gtid];
998 
999 #if OMPT_SUPPORT && OMPT_OPTIONAL
1000   kmp_team_t *team;
1001   ompt_wait_id_t lck;
1002   void *codeptr_ra;
1003   OMPT_STORE_RETURN_ADDRESS(gtid);
1004   if (ompt_enabled.enabled) {
1005     team = __kmp_team_from_gtid(gtid);
1006     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
1007     /* OMPT state update */
1008     th->th.ompt_thread_info.wait_id = lck;
1009     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
1010 
1011     /* OMPT event callback */
1012     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1013     if (ompt_enabled.ompt_callback_mutex_acquire) {
1014       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1015           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
1016           codeptr_ra);
1017     }
1018   }
1019 #endif
1020 
1021   if (th->th.th_dispatch->th_deo_fcn != 0)
1022     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
1023   else
1024     __kmp_parallel_deo(&gtid, &cid, loc);
1025 
1026 #if OMPT_SUPPORT && OMPT_OPTIONAL
1027   if (ompt_enabled.enabled) {
1028     /* OMPT state update */
1029     th->th.ompt_thread_info.state = ompt_state_work_parallel;
1030     th->th.ompt_thread_info.wait_id = 0;
1031 
1032     /* OMPT event callback */
1033     if (ompt_enabled.ompt_callback_mutex_acquired) {
1034       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1035           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1036     }
1037   }
1038 #endif
1039 
1040 #if USE_ITT_BUILD
1041   __kmp_itt_ordered_start(gtid);
1042 #endif /* USE_ITT_BUILD */
1043 }
1044 
1045 /*!
1046 @ingroup WORK_SHARING
1047 @param loc  source location information.
1048 @param gtid  global thread number.
1049 
1050 End execution of an <tt>ordered</tt> construct.
1051 */
1052 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1053   int cid = 0;
1054   kmp_info_t *th;
1055 
1056   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1057   __kmp_assert_valid_gtid(gtid);
1058 
1059 #if USE_ITT_BUILD
1060   __kmp_itt_ordered_end(gtid);
1061 // TODO: ordered_wait_id
1062 #endif /* USE_ITT_BUILD */
1063 
1064   th = __kmp_threads[gtid];
1065 
1066   if (th->th.th_dispatch->th_dxo_fcn != 0)
1067     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1068   else
1069     __kmp_parallel_dxo(&gtid, &cid, loc);
1070 
1071 #if OMPT_SUPPORT && OMPT_OPTIONAL
1072   OMPT_STORE_RETURN_ADDRESS(gtid);
1073   if (ompt_enabled.ompt_callback_mutex_released) {
1074     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1075         ompt_mutex_ordered,
1076         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1077             ->t.t_ordered.dt.t_value,
1078         OMPT_LOAD_RETURN_ADDRESS(gtid));
1079   }
1080 #endif
1081 }
1082 
1083 #if KMP_USE_DYNAMIC_LOCK
1084 
1085 static __forceinline void
1086 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1087                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1088   // Pointer to the allocated indirect lock is written to crit, while indexing
1089   // is ignored.
1090   void *idx;
1091   kmp_indirect_lock_t **lck;
1092   lck = (kmp_indirect_lock_t **)crit;
1093   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1094   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1095   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1096   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1097   KA_TRACE(20,
1098            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1099 #if USE_ITT_BUILD
1100   __kmp_itt_critical_creating(ilk->lock, loc);
1101 #endif
1102   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1103   if (status == 0) {
1104 #if USE_ITT_BUILD
1105     __kmp_itt_critical_destroyed(ilk->lock);
1106 #endif
1107     // We don't really need to destroy the unclaimed lock here since it will be
1108     // cleaned up at program exit.
1109     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1110   }
1111   KMP_DEBUG_ASSERT(*lck != NULL);
1112 }
1113 
1114 // Fast-path acquire tas lock
1115 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1116   {                                                                            \
1117     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1118     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1119     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1120     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1121         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1122       kmp_uint32 spins;                                                        \
1123       KMP_FSYNC_PREPARE(l);                                                    \
1124       KMP_INIT_YIELD(spins);                                                   \
1125       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1126       do {                                                                     \
1127         if (TCR_4(__kmp_nth) >                                                 \
1128             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1129           KMP_YIELD(TRUE);                                                     \
1130         } else {                                                               \
1131           KMP_YIELD_SPIN(spins);                                               \
1132         }                                                                      \
1133         __kmp_spin_backoff(&backoff);                                          \
1134       } while (                                                                \
1135           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1136           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1137     }                                                                          \
1138     KMP_FSYNC_ACQUIRED(l);                                                     \
1139   }
1140 
1141 // Fast-path test tas lock
1142 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1143   {                                                                            \
1144     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1145     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1146     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1147     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1148          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1149   }
1150 
1151 // Fast-path release tas lock
1152 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1153   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1154 
1155 #if KMP_USE_FUTEX
1156 
1157 #include <sys/syscall.h>
1158 #include <unistd.h>
1159 #ifndef FUTEX_WAIT
1160 #define FUTEX_WAIT 0
1161 #endif
1162 #ifndef FUTEX_WAKE
1163 #define FUTEX_WAKE 1
1164 #endif
1165 
1166 // Fast-path acquire futex lock
1167 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1168   {                                                                            \
1169     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1170     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1171     KMP_MB();                                                                  \
1172     KMP_FSYNC_PREPARE(ftx);                                                    \
1173     kmp_int32 poll_val;                                                        \
1174     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1175                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1176                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1177       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1178       if (!cond) {                                                             \
1179         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1180                                          poll_val |                            \
1181                                              KMP_LOCK_BUSY(1, futex))) {       \
1182           continue;                                                            \
1183         }                                                                      \
1184         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1185       }                                                                        \
1186       kmp_int32 rc;                                                            \
1187       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1188                         NULL, NULL, 0)) != 0) {                                \
1189         continue;                                                              \
1190       }                                                                        \
1191       gtid_code |= 1;                                                          \
1192     }                                                                          \
1193     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1194   }
1195 
1196 // Fast-path test futex lock
1197 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1198   {                                                                            \
1199     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1200     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1201                                     KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1202       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1203       rc = TRUE;                                                               \
1204     } else {                                                                   \
1205       rc = FALSE;                                                              \
1206     }                                                                          \
1207   }
1208 
1209 // Fast-path release futex lock
1210 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1211   {                                                                            \
1212     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1213     KMP_MB();                                                                  \
1214     KMP_FSYNC_RELEASING(ftx);                                                  \
1215     kmp_int32 poll_val =                                                       \
1216         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1217     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1218       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1219               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1220     }                                                                          \
1221     KMP_MB();                                                                  \
1222     KMP_YIELD_OVERSUB();                                                       \
1223   }
1224 
1225 #endif // KMP_USE_FUTEX
1226 
1227 #else // KMP_USE_DYNAMIC_LOCK
1228 
1229 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1230                                                       ident_t const *loc,
1231                                                       kmp_int32 gtid) {
1232   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1233 
1234   // Because of the double-check, the following load doesn't need to be volatile
1235   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1236 
1237   if (lck == NULL) {
1238     void *idx;
1239 
1240     // Allocate & initialize the lock.
1241     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1242     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1243     __kmp_init_user_lock_with_checks(lck);
1244     __kmp_set_user_lock_location(lck, loc);
1245 #if USE_ITT_BUILD
1246     __kmp_itt_critical_creating(lck);
1247 // __kmp_itt_critical_creating() should be called *before* the first usage
1248 // of underlying lock. It is the only place where we can guarantee it. There
1249 // are chances the lock will destroyed with no usage, but it is not a
1250 // problem, because this is not real event seen by user but rather setting
1251 // name for object (lock). See more details in kmp_itt.h.
1252 #endif /* USE_ITT_BUILD */
1253 
1254     // Use a cmpxchg instruction to slam the start of the critical section with
1255     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1256     // and use the lock that the other thread allocated.
1257     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1258 
1259     if (status == 0) {
1260 // Deallocate the lock and reload the value.
1261 #if USE_ITT_BUILD
1262       __kmp_itt_critical_destroyed(lck);
1263 // Let ITT know the lock is destroyed and the same memory location may be reused
1264 // for another purpose.
1265 #endif /* USE_ITT_BUILD */
1266       __kmp_destroy_user_lock_with_checks(lck);
1267       __kmp_user_lock_free(&idx, gtid, lck);
1268       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1269       KMP_DEBUG_ASSERT(lck != NULL);
1270     }
1271   }
1272   return lck;
1273 }
1274 
1275 #endif // KMP_USE_DYNAMIC_LOCK
1276 
1277 /*!
1278 @ingroup WORK_SHARING
1279 @param loc  source location information.
1280 @param global_tid  global thread number.
1281 @param crit identity of the critical section. This could be a pointer to a lock
1282 associated with the critical section, or some other suitably unique value.
1283 
1284 Enter code protected by a `critical` construct.
1285 This function blocks until the executing thread can enter the critical section.
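
For illustration only (the actual lock variable is a compiler-emitted symbol; the
names here are hypothetical), a named critical section
@code
#pragma omp critical(name)
{ /* body */ }
@endcode
is lowered to something like
@code
static kmp_critical_name crit_name; // zero-initialized, shared by all uses
__kmpc_critical(&loc, gtid, &crit_name);
/* body */
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode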
1286 */
1287 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1288                      kmp_critical_name *crit) {
1289 #if KMP_USE_DYNAMIC_LOCK
1290 #if OMPT_SUPPORT && OMPT_OPTIONAL
1291   OMPT_STORE_RETURN_ADDRESS(global_tid);
1292 #endif // OMPT_SUPPORT
1293   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1294 #else
1295   KMP_COUNT_BLOCK(OMP_CRITICAL);
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297   ompt_state_t prev_state = ompt_state_undefined;
1298   ompt_thread_info_t ti;
1299 #endif
1300   kmp_user_lock_p lck;
1301 
1302   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1303   __kmp_assert_valid_gtid(global_tid);
1304 
1305   // TODO: add THR_OVHD_STATE
1306 
1307   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1308   KMP_CHECK_USER_LOCK_INIT();
1309 
1310   if ((__kmp_user_lock_kind == lk_tas) &&
1311       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1312     lck = (kmp_user_lock_p)crit;
1313   }
1314 #if KMP_USE_FUTEX
1315   else if ((__kmp_user_lock_kind == lk_futex) &&
1316            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1317     lck = (kmp_user_lock_p)crit;
1318   }
1319 #endif
1320   else { // ticket, queuing or drdpa
1321     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1322   }
1323 
1324   if (__kmp_env_consistency_check)
1325     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1326 
1327     // since the critical directive binds to all threads, not just the current
1328     // team, we have to check this even if we are in a serialized team.
1329     // also, even if we are the uber thread, we still have to acquire the lock,
1330     // as we have to contend with sibling threads.
1331 
1332 #if USE_ITT_BUILD
1333   __kmp_itt_critical_acquiring(lck);
1334 #endif /* USE_ITT_BUILD */
1335 #if OMPT_SUPPORT && OMPT_OPTIONAL
1336   OMPT_STORE_RETURN_ADDRESS(global_tid);
1337   void *codeptr_ra = NULL;
1338   if (ompt_enabled.enabled) {
1339     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1340     /* OMPT state update */
1341     prev_state = ti.state;
1342     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1343     ti.state = ompt_state_wait_critical;
1344 
1345     /* OMPT event callback */
1346     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1347     if (ompt_enabled.ompt_callback_mutex_acquire) {
1348       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1349           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1350           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1351     }
1352   }
1353 #endif
1354   // Value of 'crit' should be good for using as a critical_id of the critical
1355   // section directive.
1356   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1357 
1358 #if USE_ITT_BUILD
1359   __kmp_itt_critical_acquired(lck);
1360 #endif /* USE_ITT_BUILD */
1361 #if OMPT_SUPPORT && OMPT_OPTIONAL
1362   if (ompt_enabled.enabled) {
1363     /* OMPT state update */
1364     ti.state = prev_state;
1365     ti.wait_id = 0;
1366 
1367     /* OMPT event callback */
1368     if (ompt_enabled.ompt_callback_mutex_acquired) {
1369       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1370           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1371     }
1372   }
1373 #endif
1374   KMP_POP_PARTITIONED_TIMER();
1375 
1376   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1377   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1378 #endif // KMP_USE_DYNAMIC_LOCK
1379 }
1380 
1381 #if KMP_USE_DYNAMIC_LOCK
1382 
1383 // Converts the given hint to an internal lock implementation
1384 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1385 #if KMP_USE_TSX
1386 #define KMP_TSX_LOCK(seq) lockseq_##seq
1387 #else
1388 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1389 #endif
1390 
1391 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1392 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1393 #else
1394 #define KMP_CPUINFO_RTM 0
1395 #endif
1396 
1397   // Hints that do not require further logic
1398   if (hint & kmp_lock_hint_hle)
1399     return KMP_TSX_LOCK(hle);
1400   if (hint & kmp_lock_hint_rtm)
1401     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1402   if (hint & kmp_lock_hint_adaptive)
1403     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1404 
1405   // Rule out conflicting hints first by returning the default lock
1406   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1407     return __kmp_user_lock_seq;
1408   if ((hint & omp_lock_hint_speculative) &&
1409       (hint & omp_lock_hint_nonspeculative))
1410     return __kmp_user_lock_seq;
1411 
1412   // Do not even consider speculation when it appears to be contended
1413   if (hint & omp_lock_hint_contended)
1414     return lockseq_queuing;
1415 
1416   // Uncontended lock without speculation
1417   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1418     return lockseq_tas;
1419 
1420   // Use RTM lock for speculation
1421   if (hint & omp_lock_hint_speculative)
1422     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1423 
1424   return __kmp_user_lock_seq;
1425 }
1426 
1427 #if OMPT_SUPPORT && OMPT_OPTIONAL
1428 #if KMP_USE_DYNAMIC_LOCK
1429 static kmp_mutex_impl_t
1430 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1431   if (user_lock) {
1432     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1433     case 0:
1434       break;
1435 #if KMP_USE_FUTEX
1436     case locktag_futex:
1437       return kmp_mutex_impl_queuing;
1438 #endif
1439     case locktag_tas:
1440       return kmp_mutex_impl_spin;
1441 #if KMP_USE_TSX
1442     case locktag_hle:
1443     case locktag_rtm_spin:
1444       return kmp_mutex_impl_speculative;
1445 #endif
1446     default:
1447       return kmp_mutex_impl_none;
1448     }
1449     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1450   }
1451   KMP_ASSERT(ilock);
1452   switch (ilock->type) {
1453 #if KMP_USE_TSX
1454   case locktag_adaptive:
1455   case locktag_rtm_queuing:
1456     return kmp_mutex_impl_speculative;
1457 #endif
1458   case locktag_nested_tas:
1459     return kmp_mutex_impl_spin;
1460 #if KMP_USE_FUTEX
1461   case locktag_nested_futex:
1462 #endif
1463   case locktag_ticket:
1464   case locktag_queuing:
1465   case locktag_drdpa:
1466   case locktag_nested_ticket:
1467   case locktag_nested_queuing:
1468   case locktag_nested_drdpa:
1469     return kmp_mutex_impl_queuing;
1470   default:
1471     return kmp_mutex_impl_none;
1472   }
1473 }
1474 #else
1475 // For locks without dynamic binding
1476 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1477   switch (__kmp_user_lock_kind) {
1478   case lk_tas:
1479     return kmp_mutex_impl_spin;
1480 #if KMP_USE_FUTEX
1481   case lk_futex:
1482 #endif
1483   case lk_ticket:
1484   case lk_queuing:
1485   case lk_drdpa:
1486     return kmp_mutex_impl_queuing;
1487 #if KMP_USE_TSX
1488   case lk_hle:
1489   case lk_rtm_queuing:
1490   case lk_rtm_spin:
1491   case lk_adaptive:
1492     return kmp_mutex_impl_speculative;
1493 #endif
1494   default:
1495     return kmp_mutex_impl_none;
1496   }
1497 }
1498 #endif // KMP_USE_DYNAMIC_LOCK
1499 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1500 
1501 /*!
1502 @ingroup WORK_SHARING
1503 @param loc  source location information.
1504 @param global_tid  global thread number.
1505 @param crit identity of the critical section. This could be a pointer to a lock
1506 associated with the critical section, or some other suitably unique value.
1507 @param hint the lock hint.
1508 
1509 Enter code protected by a `critical` construct with a hint. The hint value is
1510 used to suggest a lock implementation. This function blocks until the executing
1511 thread can enter the critical section unless the hint suggests use of
1512 speculative execution and the hardware supports it.
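
As an illustration only (a sketch of how a compiler might lower the construct;
the lock name and hint below are hypothetical), the construct expands to a
matched pair of calls:
@code
static kmp_critical_name crit_lock = {0}; // compiler-emitted, zero-initialized

void lowered_region(ident_t *loc, kmp_int32 gtid) {
  __kmpc_critical_with_hint(loc, gtid, &crit_lock, omp_lock_hint_speculative);
  // ... body of the critical region ...
  __kmpc_end_critical(loc, gtid, &crit_lock);
}
@endcode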
1513 */
1514 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1515                                kmp_critical_name *crit, uint32_t hint) {
1516   KMP_COUNT_BLOCK(OMP_CRITICAL);
1517   kmp_user_lock_p lck;
1518 #if OMPT_SUPPORT && OMPT_OPTIONAL
1519   ompt_state_t prev_state = ompt_state_undefined;
1520   ompt_thread_info_t ti;
1521   // This is the case if called from __kmpc_critical:
1522   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1523   if (!codeptr)
1524     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1525 #endif
1526 
1527   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1528   __kmp_assert_valid_gtid(global_tid);
1529 
1530   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1531   // Check if it is initialized.
1532   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1533   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1534   if (*lk == 0) {
1535     if (KMP_IS_D_LOCK(lockseq)) {
1536       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1537                                   KMP_GET_D_TAG(lockseq));
1538     } else {
1539       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1540     }
1541   }
1542   // Branch for accessing the actual lock object and set operation. This
1543   // branching is inevitable since this lock initialization does not follow the
1544   // normal dispatch path (lock table is not used).
1545   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1546     lck = (kmp_user_lock_p)lk;
1547     if (__kmp_env_consistency_check) {
1548       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1549                       __kmp_map_hint_to_lock(hint));
1550     }
1551 #if USE_ITT_BUILD
1552     __kmp_itt_critical_acquiring(lck);
1553 #endif
1554 #if OMPT_SUPPORT && OMPT_OPTIONAL
1555     if (ompt_enabled.enabled) {
1556       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1557       /* OMPT state update */
1558       prev_state = ti.state;
1559       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1560       ti.state = ompt_state_wait_critical;
1561 
1562       /* OMPT event callback */
1563       if (ompt_enabled.ompt_callback_mutex_acquire) {
1564         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1565             ompt_mutex_critical, (unsigned int)hint,
1566             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1567             codeptr);
1568       }
1569     }
1570 #endif
1571 #if KMP_USE_INLINED_TAS
1572     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1573       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1574     } else
1575 #elif KMP_USE_INLINED_FUTEX
1576     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1577       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1578     } else
1579 #endif
1580     {
1581       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1582     }
1583   } else {
1584     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1585     lck = ilk->lock;
1586     if (__kmp_env_consistency_check) {
1587       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1588                       __kmp_map_hint_to_lock(hint));
1589     }
1590 #if USE_ITT_BUILD
1591     __kmp_itt_critical_acquiring(lck);
1592 #endif
1593 #if OMPT_SUPPORT && OMPT_OPTIONAL
1594     if (ompt_enabled.enabled) {
1595       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1596       /* OMPT state update */
1597       prev_state = ti.state;
1598       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1599       ti.state = ompt_state_wait_critical;
1600 
1601       /* OMPT event callback */
1602       if (ompt_enabled.ompt_callback_mutex_acquire) {
1603         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1604             ompt_mutex_critical, (unsigned int)hint,
1605             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1606             codeptr);
1607       }
1608     }
1609 #endif
1610     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1611   }
1612   KMP_POP_PARTITIONED_TIMER();
1613 
1614 #if USE_ITT_BUILD
1615   __kmp_itt_critical_acquired(lck);
1616 #endif /* USE_ITT_BUILD */
1617 #if OMPT_SUPPORT && OMPT_OPTIONAL
1618   if (ompt_enabled.enabled) {
1619     /* OMPT state update */
1620     ti.state = prev_state;
1621     ti.wait_id = 0;
1622 
1623     /* OMPT event callback */
1624     if (ompt_enabled.ompt_callback_mutex_acquired) {
1625       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1626           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1627     }
1628   }
1629 #endif
1630 
1631   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1632   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1633 } // __kmpc_critical_with_hint
1634 
1635 #endif // KMP_USE_DYNAMIC_LOCK
1636 
1637 /*!
1638 @ingroup WORK_SHARING
1639 @param loc  source location information.
1640 @param global_tid  global thread number.
1641 @param crit identity of the critical section. This could be a pointer to a lock
1642 associated with the critical section, or some other suitably unique value.
1643 
1644 Leave a critical section, releasing any lock that was held during its execution.
1645 */
1646 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1647                          kmp_critical_name *crit) {
1648   kmp_user_lock_p lck;
1649 
1650   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1651 
1652 #if KMP_USE_DYNAMIC_LOCK
1653   int locktag = KMP_EXTRACT_D_TAG(crit);
1654   if (locktag) {
1655     lck = (kmp_user_lock_p)crit;
1656     KMP_ASSERT(lck != NULL);
1657     if (__kmp_env_consistency_check) {
1658       __kmp_pop_sync(global_tid, ct_critical, loc);
1659     }
1660 #if USE_ITT_BUILD
1661     __kmp_itt_critical_releasing(lck);
1662 #endif
1663 #if KMP_USE_INLINED_TAS
1664     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1665       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1666     } else
1667 #elif KMP_USE_INLINED_FUTEX
1668     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1669       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1670     } else
1671 #endif
1672     {
1673       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1674     }
1675   } else {
1676     kmp_indirect_lock_t *ilk =
1677         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1678     KMP_ASSERT(ilk != NULL);
1679     lck = ilk->lock;
1680     if (__kmp_env_consistency_check) {
1681       __kmp_pop_sync(global_tid, ct_critical, loc);
1682     }
1683 #if USE_ITT_BUILD
1684     __kmp_itt_critical_releasing(lck);
1685 #endif
1686     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1687   }
1688 
1689 #else // KMP_USE_DYNAMIC_LOCK
1690 
1691   if ((__kmp_user_lock_kind == lk_tas) &&
1692       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1693     lck = (kmp_user_lock_p)crit;
1694   }
1695 #if KMP_USE_FUTEX
1696   else if ((__kmp_user_lock_kind == lk_futex) &&
1697            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1698     lck = (kmp_user_lock_p)crit;
1699   }
1700 #endif
1701   else { // ticket, queuing or drdpa
1702     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1703   }
1704 
1705   KMP_ASSERT(lck != NULL);
1706 
1707   if (__kmp_env_consistency_check)
1708     __kmp_pop_sync(global_tid, ct_critical, loc);
1709 
1710 #if USE_ITT_BUILD
1711   __kmp_itt_critical_releasing(lck);
1712 #endif /* USE_ITT_BUILD */
1713   // The value of 'crit' is suitable for use as the critical_id of the
1714   // critical section directive.
1715   __kmp_release_user_lock_with_checks(lck, global_tid);
1716 
1717 #endif // KMP_USE_DYNAMIC_LOCK
1718 
1719 #if OMPT_SUPPORT && OMPT_OPTIONAL
1720   /* OMPT release event triggers after lock is released; place here to trigger
1721    * for all #if branches */
1722   OMPT_STORE_RETURN_ADDRESS(global_tid);
1723   if (ompt_enabled.ompt_callback_mutex_released) {
1724     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1725         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1726         OMPT_LOAD_RETURN_ADDRESS(0));
1727   }
1728 #endif
1729 
1730   KMP_POP_PARTITIONED_TIMER();
1731   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1732 }
1733 
1734 /*!
1735 @ingroup SYNCHRONIZATION
1736 @param loc source location information
1737 @param global_tid thread id.
1738 @return one if the thread should execute the master block, zero otherwise
1739 
1740 Start execution of a combined barrier and master. The barrier is executed inside
1741 this function.
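
A sketch of the expected call pattern (illustrative only): only the thread for
which this routine returns one executes the protected block and the matching
"end" call.
@code
if (__kmpc_barrier_master(loc, gtid)) {
  // ... code executed only by the master thread ...
  __kmpc_end_barrier_master(loc, gtid); // releases the other waiting threads
}
@endcode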
1742 */
1743 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1744   int status;
1745   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1746   __kmp_assert_valid_gtid(global_tid);
1747 
1748   if (!TCR_4(__kmp_init_parallel))
1749     __kmp_parallel_initialize();
1750 
1751   __kmp_resume_if_soft_paused();
1752 
1753   if (__kmp_env_consistency_check)
1754     __kmp_check_barrier(global_tid, ct_barrier, loc);
1755 
1756 #if OMPT_SUPPORT
1757   ompt_frame_t *ompt_frame;
1758   if (ompt_enabled.enabled) {
1759     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1760     if (ompt_frame->enter_frame.ptr == NULL)
1761       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1762   }
1763   OMPT_STORE_RETURN_ADDRESS(global_tid);
1764 #endif
1765 #if USE_ITT_NOTIFY
1766   __kmp_threads[global_tid]->th.th_ident = loc;
1767 #endif
1768   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1769 #if OMPT_SUPPORT && OMPT_OPTIONAL
1770   if (ompt_enabled.enabled) {
1771     ompt_frame->enter_frame = ompt_data_none;
1772   }
1773 #endif
1774 
1775   return (status != 0) ? 0 : 1;
1776 }
1777 
1778 /*!
1779 @ingroup SYNCHRONIZATION
1780 @param loc source location information
1781 @param global_tid thread id.
1782 
1783 Complete the execution of a combined barrier and master. This function should
1784 only be called at the completion of the <tt>master</tt> code. Other threads will
1785 still be waiting at the barrier and this call releases them.
1786 */
1787 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1788   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1789   __kmp_assert_valid_gtid(global_tid);
1790   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1791 }
1792 
1793 /*!
1794 @ingroup SYNCHRONIZATION
1795 @param loc source location information
1796 @param global_tid thread id.
1797 @return one if the thread should execute the master block, zero otherwise
1798 
1799 Start execution of a combined barrier and master(nowait) construct.
1800 The barrier is executed inside this function.
1801 There is no equivalent "end" function, since the other threads do not wait.
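
A sketch of the expected call pattern (illustrative only); note that no "end"
call follows the block:
@code
if (__kmpc_barrier_master_nowait(loc, gtid)) {
  // ... code executed only by the master thread ...
}
@endcode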
1802 */
1803 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1804   kmp_int32 ret;
1805   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1806   __kmp_assert_valid_gtid(global_tid);
1807 
1808   if (!TCR_4(__kmp_init_parallel))
1809     __kmp_parallel_initialize();
1810 
1811   __kmp_resume_if_soft_paused();
1812 
1813   if (__kmp_env_consistency_check) {
1814     if (loc == 0) {
1815       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1816     }
1817     __kmp_check_barrier(global_tid, ct_barrier, loc);
1818   }
1819 
1820 #if OMPT_SUPPORT
1821   ompt_frame_t *ompt_frame;
1822   if (ompt_enabled.enabled) {
1823     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1824     if (ompt_frame->enter_frame.ptr == NULL)
1825       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1826   }
1827   OMPT_STORE_RETURN_ADDRESS(global_tid);
1828 #endif
1829 #if USE_ITT_NOTIFY
1830   __kmp_threads[global_tid]->th.th_ident = loc;
1831 #endif
1832   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1833 #if OMPT_SUPPORT && OMPT_OPTIONAL
1834   if (ompt_enabled.enabled) {
1835     ompt_frame->enter_frame = ompt_data_none;
1836   }
1837 #endif
1838 
1839   ret = __kmpc_master(loc, global_tid);
1840 
1841   if (__kmp_env_consistency_check) {
1842     /*  there's no __kmpc_end_master called; so the (stats) */
1843     /*  actions of __kmpc_end_master are done here          */
1844     if (ret) {
1845       /* only one thread should do the pop since only */
1846       /* one did the push (see __kmpc_master())       */
1847       __kmp_pop_sync(global_tid, ct_master, loc);
1848     }
1849   }
1850 
1851   return (ret);
1852 }
1853 
1854 /* The BARRIER for a SINGLE process section is always explicit   */
1855 /*!
1856 @ingroup WORK_SHARING
1857 @param loc  source location information
1858 @param global_tid  global thread number
1859 @return One if this thread should execute the single construct, zero otherwise.
1860 
1861 Test whether to execute a <tt>single</tt> construct.
1862 There are no implicit barriers in the two "single" calls; rather, the compiler
1863 should introduce an explicit barrier if one is required.
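
A sketch of the expected call pattern (illustrative only; the trailing
__kmpc_barrier is emitted by the compiler only when the construct has no
nowait clause):
@code
if (__kmpc_single(loc, gtid)) {
  // ... code executed by the single thread ...
  __kmpc_end_single(loc, gtid);
}
__kmpc_barrier(loc, gtid); // explicit barrier, omitted for nowait
@endcode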
1864 */
1865 
1866 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1867   __kmp_assert_valid_gtid(global_tid);
1868   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1869 
1870   if (rc) {
1871     // We are going to execute the single statement, so we should count it.
1872     KMP_COUNT_BLOCK(OMP_SINGLE);
1873     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1874   }
1875 
1876 #if OMPT_SUPPORT && OMPT_OPTIONAL
1877   kmp_info_t *this_thr = __kmp_threads[global_tid];
1878   kmp_team_t *team = this_thr->th.th_team;
1879   int tid = __kmp_tid_from_gtid(global_tid);
1880 
1881   if (ompt_enabled.enabled) {
1882     if (rc) {
1883       if (ompt_enabled.ompt_callback_work) {
1884         ompt_callbacks.ompt_callback(ompt_callback_work)(
1885             ompt_work_single_executor, ompt_scope_begin,
1886             &(team->t.ompt_team_info.parallel_data),
1887             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1888             1, OMPT_GET_RETURN_ADDRESS(0));
1889       }
1890     } else {
1891       if (ompt_enabled.ompt_callback_work) {
1892         ompt_callbacks.ompt_callback(ompt_callback_work)(
1893             ompt_work_single_other, ompt_scope_begin,
1894             &(team->t.ompt_team_info.parallel_data),
1895             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1896             1, OMPT_GET_RETURN_ADDRESS(0));
1897         ompt_callbacks.ompt_callback(ompt_callback_work)(
1898             ompt_work_single_other, ompt_scope_end,
1899             &(team->t.ompt_team_info.parallel_data),
1900             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1901             1, OMPT_GET_RETURN_ADDRESS(0));
1902       }
1903     }
1904   }
1905 #endif
1906 
1907   return rc;
1908 }
1909 
1910 /*!
1911 @ingroup WORK_SHARING
1912 @param loc  source location information
1913 @param global_tid  global thread number
1914 
1915 Mark the end of a <tt>single</tt> construct.  This function should
1916 only be called by the thread that executed the block of code protected
1917 by the `single` construct.
1918 */
1919 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1920   __kmp_assert_valid_gtid(global_tid);
1921   __kmp_exit_single(global_tid);
1922   KMP_POP_PARTITIONED_TIMER();
1923 
1924 #if OMPT_SUPPORT && OMPT_OPTIONAL
1925   kmp_info_t *this_thr = __kmp_threads[global_tid];
1926   kmp_team_t *team = this_thr->th.th_team;
1927   int tid = __kmp_tid_from_gtid(global_tid);
1928 
1929   if (ompt_enabled.ompt_callback_work) {
1930     ompt_callbacks.ompt_callback(ompt_callback_work)(
1931         ompt_work_single_executor, ompt_scope_end,
1932         &(team->t.ompt_team_info.parallel_data),
1933         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1934         OMPT_GET_RETURN_ADDRESS(0));
1935   }
1936 #endif
1937 }
1938 
1939 /*!
1940 @ingroup WORK_SHARING
1941 @param loc Source location
1942 @param global_tid Global thread id
1943 
1944 Mark the end of a statically scheduled loop.
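
A sketch of the pairing with the static worksharing initializer (illustrative
only; the schedule constant and arguments shown are assumptions about what a
compiler would emit for a simple static loop):
@code
kmp_int32 lower, upper, stride, last;
__kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // ... loop body ...
}
__kmpc_for_static_fini(loc, gtid); // marks the end of the worksharing region
@endcode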
1945 */
1946 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1947   KMP_POP_PARTITIONED_TIMER();
1948   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1949 
1950 #if OMPT_SUPPORT && OMPT_OPTIONAL
1951   if (ompt_enabled.ompt_callback_work) {
1952     ompt_work_t ompt_work_type = ompt_work_loop;
1953     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1954     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1955     // Determine workshare type
1956     if (loc != NULL) {
1957       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1958         ompt_work_type = ompt_work_loop;
1959       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1960         ompt_work_type = ompt_work_sections;
1961       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1962         ompt_work_type = ompt_work_distribute;
1963       } else {
1964         // use default set above.
1965         // a warning about this case is provided in __kmpc_for_static_init
1966       }
1967       KMP_DEBUG_ASSERT(ompt_work_type);
1968     }
1969     ompt_callbacks.ompt_callback(ompt_callback_work)(
1970         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1971         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1972   }
1973 #endif
1974   if (__kmp_env_consistency_check)
1975     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1976 }
1977 
1978 // User routines which take C-style arguments (call by value)
1979 // different from the Fortran equivalent routines
1980 
1981 void ompc_set_num_threads(int arg) {
1982   // !!!!! TODO: check the per-task binding
1983   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1984 }
1985 
1986 void ompc_set_dynamic(int flag) {
1987   kmp_info_t *thread;
1988 
1989   /* For the thread-private implementation of the internal controls */
1990   thread = __kmp_entry_thread();
1991 
1992   __kmp_save_internal_controls(thread);
1993 
1994   set__dynamic(thread, flag ? true : false);
1995 }
1996 
1997 void ompc_set_nested(int flag) {
1998   kmp_info_t *thread;
1999 
2000   /* For the thread-private internal controls implementation */
2001   thread = __kmp_entry_thread();
2002 
2003   __kmp_save_internal_controls(thread);
2004 
2005   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
2006 }
2007 
2008 void ompc_set_max_active_levels(int max_active_levels) {
2009   /* TO DO */
2010   /* we want per-task implementation of this internal control */
2011 
2012   /* For the per-thread internal controls implementation */
2013   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
2014 }
2015 
2016 void ompc_set_schedule(omp_sched_t kind, int modifier) {
2017   // !!!!! TODO: check the per-task binding
2018   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
2019 }
2020 
2021 int ompc_get_ancestor_thread_num(int level) {
2022   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
2023 }
2024 
2025 int ompc_get_team_size(int level) {
2026   return __kmp_get_team_size(__kmp_entry_gtid(), level);
2027 }
2028 
2029 /* OpenMP 5.0 Affinity Format API */
2030 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2031   if (!__kmp_init_serial) {
2032     __kmp_serial_initialize();
2033   }
2034   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2035                          format, KMP_STRLEN(format) + 1);
2036 }
2037 
2038 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2039   size_t format_size;
2040   if (!__kmp_init_serial) {
2041     __kmp_serial_initialize();
2042   }
2043   format_size = KMP_STRLEN(__kmp_affinity_format);
2044   if (buffer && size) {
2045     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2046                            format_size + 1);
2047   }
2048   return format_size;
2049 }
2050 
2051 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2052   int gtid;
2053   if (!TCR_4(__kmp_init_middle)) {
2054     __kmp_middle_initialize();
2055   }
2056   __kmp_assign_root_init_mask();
2057   gtid = __kmp_get_gtid();
2058 #if KMP_AFFINITY_SUPPORTED
2059   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2060       __kmp_affinity.flags.reset) {
2061     __kmp_reset_root_init_mask(gtid);
2062   }
2063 #endif
2064   __kmp_aux_display_affinity(gtid, format);
2065 }
2066 
2067 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2068                                               char const *format) {
2069   int gtid;
2070   size_t num_required;
2071   kmp_str_buf_t capture_buf;
2072   if (!TCR_4(__kmp_init_middle)) {
2073     __kmp_middle_initialize();
2074   }
2075   __kmp_assign_root_init_mask();
2076   gtid = __kmp_get_gtid();
2077 #if KMP_AFFINITY_SUPPORTED
2078   if (__kmp_threads[gtid]->th.th_team->t.t_level == 0 &&
2079       __kmp_affinity.flags.reset) {
2080     __kmp_reset_root_init_mask(gtid);
2081   }
2082 #endif
2083   __kmp_str_buf_init(&capture_buf);
2084   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2085   if (buffer && buf_size) {
2086     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2087                            capture_buf.used + 1);
2088   }
2089   __kmp_str_buf_free(&capture_buf);
2090   return num_required;
2091 }
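
// Usage sketch (illustrative only; the format string and buffer size are
// hypothetical). The return value is the number of characters required, so a
// truncated capture can be detected and retried with a larger buffer:
//
//   char buf[64];
//   size_t needed = ompc_capture_affinity(
//       buf, sizeof(buf), "%{thread_num} %{thread_affinity}");
//   if (needed >= sizeof(buf)) {
//     // buf was truncated; allocate needed + 1 bytes and call again
//   }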
2092 
2093 void kmpc_set_stacksize(int arg) {
2094   // __kmp_aux_set_stacksize initializes the library if needed
2095   __kmp_aux_set_stacksize(arg);
2096 }
2097 
2098 void kmpc_set_stacksize_s(size_t arg) {
2099   // __kmp_aux_set_stacksize initializes the library if needed
2100   __kmp_aux_set_stacksize(arg);
2101 }
2102 
2103 void kmpc_set_blocktime(int arg) {
2104   int gtid, tid, bt = arg;
2105   kmp_info_t *thread;
2106 
2107   gtid = __kmp_entry_gtid();
2108   tid = __kmp_tid_from_gtid(gtid);
2109   thread = __kmp_thread_from_gtid(gtid);
2110 
2111   __kmp_aux_convert_blocktime(&bt);
2112   __kmp_aux_set_blocktime(bt, thread, tid);
2113 }
2114 
2115 void kmpc_set_library(int arg) {
2116   // __kmp_user_set_library initializes the library if needed
2117   __kmp_user_set_library((enum library_type)arg);
2118 }
2119 
2120 void kmpc_set_defaults(char const *str) {
2121   // __kmp_aux_set_defaults initializes the library if needed
2122   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2123 }
2124 
2125 void kmpc_set_disp_num_buffers(int arg) {
2126   // ignore after initialization because some teams have already
2127   // allocated dispatch buffers
2128   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2129       arg <= KMP_MAX_DISP_NUM_BUFF) {
2130     __kmp_dispatch_num_buffers = arg;
2131   }
2132 }
2133 
2134 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2135 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2136   return -1;
2137 #else
2138   if (!TCR_4(__kmp_init_middle)) {
2139     __kmp_middle_initialize();
2140   }
2141   __kmp_assign_root_init_mask();
2142   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2143 #endif
2144 }
2145 
2146 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2147 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2148   return -1;
2149 #else
2150   if (!TCR_4(__kmp_init_middle)) {
2151     __kmp_middle_initialize();
2152   }
2153   __kmp_assign_root_init_mask();
2154   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2155 #endif
2156 }
2157 
2158 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2159 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2160   return -1;
2161 #else
2162   if (!TCR_4(__kmp_init_middle)) {
2163     __kmp_middle_initialize();
2164   }
2165   __kmp_assign_root_init_mask();
2166   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2167 #endif
2168 }
2169 
2170 /* -------------------------------------------------------------------------- */
2171 /*!
2172 @ingroup THREADPRIVATE
2173 @param loc       source location information
2174 @param gtid      global thread number
2175 @param cpy_size  size of the cpy_data buffer
2176 @param cpy_data  pointer to data to be copied
2177 @param cpy_func  helper function to call for copying data
2178 @param didit     flag variable: 1=single thread; 0=not single thread
2179 
2180 __kmpc_copyprivate implements the interface for the private data broadcast
2181 needed for the copyprivate clause associated with a single region in an
2182 OpenMP<sup>*</sup> program (both C and Fortran).
2183 All threads participating in the parallel region call this routine.
2184 One of the threads (called the single thread) should have the <tt>didit</tt>
2185 variable set to 1 and all other threads should have that variable set to 0.
2186 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2187 
2188 The OpenMP specification forbids the use of nowait on the single region when a
2189 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2190 barrier internally to avoid race conditions, so the code generation for the
2191 single region should avoid generating a barrier after the call to @ref
2192 __kmpc_copyprivate.
2193 
2194 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2195 The <tt>loc</tt> parameter is a pointer to source location information.
2196 
2197 Internal implementation: The single thread first copies its descriptor
2198 address (cpy_data) to a team-private location; the other threads then each
2199 call the function pointed to by the parameter cpy_func, which copies from the
2200 single thread's cpy_data buffer into the calling thread's cpy_data buffer.
2201 
2202 The cpy_func routine used for the copy and the contents of the data area defined
2203 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2204 to be done. For instance, the cpy_data buffer can hold the actual data to be
2205 copied or it may hold a list of pointers to the data. The cpy_func routine must
2206 interpret the cpy_data buffer appropriately.
2207 
2208 The interface to cpy_func is as follows:
2209 @code
2210 void cpy_func( void *destination, void *source )
2211 @endcode
2212 where void *destination is the cpy_data pointer for the thread being copied to
2213 and void *source is the cpy_data pointer for the thread being copied from.
2214 */
2215 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2216                         void *cpy_data, void (*cpy_func)(void *, void *),
2217                         kmp_int32 didit) {
2218   void **data_ptr;
2219   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2220   __kmp_assert_valid_gtid(gtid);
2221 
2222   KMP_MB();
2223 
2224   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2225 
2226   if (__kmp_env_consistency_check) {
2227     if (loc == 0) {
2228       KMP_WARNING(ConstructIdentInvalid);
2229     }
2230   }
2231 
2232   // ToDo: Optimize the following two barriers into some kind of split barrier
2233 
2234   if (didit)
2235     *data_ptr = cpy_data;
2236 
2237 #if OMPT_SUPPORT
2238   ompt_frame_t *ompt_frame;
2239   if (ompt_enabled.enabled) {
2240     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2241     if (ompt_frame->enter_frame.ptr == NULL)
2242       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2243   }
2244   OMPT_STORE_RETURN_ADDRESS(gtid);
2245 #endif
2246 /* This barrier is not a barrier region boundary */
2247 #if USE_ITT_NOTIFY
2248   __kmp_threads[gtid]->th.th_ident = loc;
2249 #endif
2250   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2251 
2252   if (!didit)
2253     (*cpy_func)(cpy_data, *data_ptr);
2254 
2255   // Consider next barrier a user-visible barrier for barrier region boundaries
2256   // Nesting checks are already handled by the single construct checks
2257   {
2258 #if OMPT_SUPPORT
2259     OMPT_STORE_RETURN_ADDRESS(gtid);
2260 #endif
2261 #if USE_ITT_NOTIFY
2262     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2263 // tasks can overwrite the location)
2264 #endif
2265     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2266 #if OMPT_SUPPORT && OMPT_OPTIONAL
2267     if (ompt_enabled.enabled) {
2268       ompt_frame->enter_frame = ompt_data_none;
2269     }
2270 #endif
2271   }
2272 }
2273 
2274 /* --------------------------------------------------------------------------*/
2275 /*!
2276 @ingroup THREADPRIVATE
2277 @param loc       source location information
2278 @param gtid      global thread number
2279 @param cpy_data  pointer to the data to be saved/copied or 0
2280 @return          the saved pointer to the data
2281 
2282 __kmpc_copyprivate_light is a lighter version of __kmpc_copyprivate: it only
2283 saves the pointer it is given (when non-zero, i.e. from the thread that
2284 executed the single region) and returns that pointer from every call (the
2285 single thread itself does not need it). This version does not do any actual
2286 data copying; the copying has to be done elsewhere, e.g. inline in the
2287 generated code. Consequently, unlike __kmpc_copyprivate, this function does
2288 not end with a barrier after the copy, so the generated code must issue a
2289 barrier once all the data has been copied.
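
For illustration only, a minimal sketch (with hypothetical names) matching the
example for __kmpc_copyprivate above:
@code
// 'didit' is 1 only in the thread that executed the single region.
int x = 0;
void *src = __kmpc_copyprivate_light(loc, gtid, didit ? (void *)&x : NULL);
if (!didit)
  x = *(int *)src;         // inline copy emitted by the generated code
__kmpc_barrier(loc, gtid); // barrier required after all copying is done
@endcode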
2290 */
2291 void *__kmpc_copyprivate_light(ident_t *loc, kmp_int32 gtid, void *cpy_data) {
2292   void **data_ptr;
2293 
2294   KC_TRACE(10, ("__kmpc_copyprivate_light: called T#%d\n", gtid));
2295 
2296   KMP_MB();
2297 
2298   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2299 
2300   if (__kmp_env_consistency_check) {
2301     if (loc == 0) {
2302       KMP_WARNING(ConstructIdentInvalid);
2303     }
2304   }
2305 
2306   // ToDo: Optimize the following barrier
2307 
2308   if (cpy_data)
2309     *data_ptr = cpy_data;
2310 
2311 #if OMPT_SUPPORT
2312   ompt_frame_t *ompt_frame;
2313   if (ompt_enabled.enabled) {
2314     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2315     if (ompt_frame->enter_frame.ptr == NULL)
2316       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2317     OMPT_STORE_RETURN_ADDRESS(gtid);
2318   }
2319 #endif
2320 /* This barrier is not a barrier region boundary */
2321 #if USE_ITT_NOTIFY
2322   __kmp_threads[gtid]->th.th_ident = loc;
2323 #endif
2324   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2325 
2326   return *data_ptr;
2327 }
2328 
2329 /* -------------------------------------------------------------------------- */
2330 
2331 #define INIT_LOCK __kmp_init_user_lock_with_checks
2332 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2333 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2334 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2335 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2336 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2337   __kmp_acquire_nested_user_lock_with_checks_timed
2338 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2339 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2340 #define TEST_LOCK __kmp_test_user_lock_with_checks
2341 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2342 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2343 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2344 
2345 // TODO: Make check abort messages use location info & pass it into
2346 // with_checks routines
2347 
2348 #if KMP_USE_DYNAMIC_LOCK
2349 
2350 // internal lock initializer
2351 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2352                                                     kmp_dyna_lockseq_t seq) {
2353   if (KMP_IS_D_LOCK(seq)) {
2354     KMP_INIT_D_LOCK(lock, seq);
2355 #if USE_ITT_BUILD
2356     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2357 #endif
2358   } else {
2359     KMP_INIT_I_LOCK(lock, seq);
2360 #if USE_ITT_BUILD
2361     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2362     __kmp_itt_lock_creating(ilk->lock, loc);
2363 #endif
2364   }
2365 }
2366 
2367 // internal nest lock initializer
2368 static __forceinline void
2369 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2370                                kmp_dyna_lockseq_t seq) {
2371 #if KMP_USE_TSX
2372   // There is no nested lock implementation for speculative locks
2373   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2374       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2375     seq = __kmp_user_lock_seq;
2376 #endif
2377   switch (seq) {
2378   case lockseq_tas:
2379     seq = lockseq_nested_tas;
2380     break;
2381 #if KMP_USE_FUTEX
2382   case lockseq_futex:
2383     seq = lockseq_nested_futex;
2384     break;
2385 #endif
2386   case lockseq_ticket:
2387     seq = lockseq_nested_ticket;
2388     break;
2389   case lockseq_queuing:
2390     seq = lockseq_nested_queuing;
2391     break;
2392   case lockseq_drdpa:
2393     seq = lockseq_nested_drdpa;
2394     break;
2395   default:
2396     seq = lockseq_nested_queuing;
2397   }
2398   KMP_INIT_I_LOCK(lock, seq);
2399 #if USE_ITT_BUILD
2400   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2401   __kmp_itt_lock_creating(ilk->lock, loc);
2402 #endif
2403 }
2404 
2405 /* initialize the lock with a hint */
2406 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2407                                 uintptr_t hint) {
2408   KMP_DEBUG_ASSERT(__kmp_init_serial);
2409   if (__kmp_env_consistency_check && user_lock == NULL) {
2410     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2411   }
2412 
2413   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2414 
2415 #if OMPT_SUPPORT && OMPT_OPTIONAL
2416   // This is the case if called from omp_init_lock_with_hint:
2417   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2418   if (!codeptr)
2419     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2420   if (ompt_enabled.ompt_callback_lock_init) {
2421     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2422         ompt_mutex_lock, (omp_lock_hint_t)hint,
2423         __ompt_get_mutex_impl_type(user_lock),
2424         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2425   }
2426 #endif
2427 }
2428 
2429 /* initialize the nested lock with a hint */
2430 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2431                                      void **user_lock, uintptr_t hint) {
2432   KMP_DEBUG_ASSERT(__kmp_init_serial);
2433   if (__kmp_env_consistency_check && user_lock == NULL) {
2434     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2435   }
2436 
2437   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2438 
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440   // This is the case if called from omp_init_nest_lock_with_hint:
2441   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2442   if (!codeptr)
2443     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2444   if (ompt_enabled.ompt_callback_lock_init) {
2445     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2446         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2447         __ompt_get_mutex_impl_type(user_lock),
2448         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2449   }
2450 #endif
2451 }
2452 
2453 #endif // KMP_USE_DYNAMIC_LOCK
2454 
2455 /* initialize the lock */
2456 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2457 #if KMP_USE_DYNAMIC_LOCK
2458 
2459   KMP_DEBUG_ASSERT(__kmp_init_serial);
2460   if (__kmp_env_consistency_check && user_lock == NULL) {
2461     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2462   }
2463   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2464 
2465 #if OMPT_SUPPORT && OMPT_OPTIONAL
2466   // This is the case if called from omp_init_lock:
2467   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2468   if (!codeptr)
2469     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2470   if (ompt_enabled.ompt_callback_lock_init) {
2471     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2472         ompt_mutex_lock, omp_lock_hint_none,
2473         __ompt_get_mutex_impl_type(user_lock),
2474         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2475   }
2476 #endif
2477 
2478 #else // KMP_USE_DYNAMIC_LOCK
2479 
2480   static char const *const func = "omp_init_lock";
2481   kmp_user_lock_p lck;
2482   KMP_DEBUG_ASSERT(__kmp_init_serial);
2483 
2484   if (__kmp_env_consistency_check) {
2485     if (user_lock == NULL) {
2486       KMP_FATAL(LockIsUninitialized, func);
2487     }
2488   }
2489 
2490   KMP_CHECK_USER_LOCK_INIT();
2491 
2492   if ((__kmp_user_lock_kind == lk_tas) &&
2493       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2494     lck = (kmp_user_lock_p)user_lock;
2495   }
2496 #if KMP_USE_FUTEX
2497   else if ((__kmp_user_lock_kind == lk_futex) &&
2498            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2499     lck = (kmp_user_lock_p)user_lock;
2500   }
2501 #endif
2502   else {
2503     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2504   }
2505   INIT_LOCK(lck);
2506   __kmp_set_user_lock_location(lck, loc);
2507 
2508 #if OMPT_SUPPORT && OMPT_OPTIONAL
2509   // This is the case if called from omp_init_lock:
2510   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2511   if (!codeptr)
2512     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2513   if (ompt_enabled.ompt_callback_lock_init) {
2514     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2515         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2516         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2517   }
2518 #endif
2519 
2520 #if USE_ITT_BUILD
2521   __kmp_itt_lock_creating(lck);
2522 #endif /* USE_ITT_BUILD */
2523 
2524 #endif // KMP_USE_DYNAMIC_LOCK
2525 } // __kmpc_init_lock
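
// Lifecycle sketch for these lock entry points (illustrative only; 'loc',
// 'gtid' and the lock storage are hypothetical, and real programs typically
// reach these through the omp_lock_t API). The same pointer must be passed to
// every call:
//
//   void *lock_storage = NULL; // storage of at least omp_lock_t size
//   __kmpc_init_lock(loc, gtid, &lock_storage);
//   __kmpc_set_lock(loc, gtid, &lock_storage);
//   // ... code protected by the lock ...
//   __kmpc_unset_lock(loc, gtid, &lock_storage);
//   __kmpc_destroy_lock(loc, gtid, &lock_storage);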
2526 
2527 /* initialize the nested lock */
2528 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2529 #if KMP_USE_DYNAMIC_LOCK
2530 
2531   KMP_DEBUG_ASSERT(__kmp_init_serial);
2532   if (__kmp_env_consistency_check && user_lock == NULL) {
2533     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2534   }
2535   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2536 
2537 #if OMPT_SUPPORT && OMPT_OPTIONAL
2538   // This is the case if called from omp_init_nest_lock:
2539   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2540   if (!codeptr)
2541     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2542   if (ompt_enabled.ompt_callback_lock_init) {
2543     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2544         ompt_mutex_nest_lock, omp_lock_hint_none,
2545         __ompt_get_mutex_impl_type(user_lock),
2546         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2547   }
2548 #endif
2549 
2550 #else // KMP_USE_DYNAMIC_LOCK
2551 
2552   static char const *const func = "omp_init_nest_lock";
2553   kmp_user_lock_p lck;
2554   KMP_DEBUG_ASSERT(__kmp_init_serial);
2555 
2556   if (__kmp_env_consistency_check) {
2557     if (user_lock == NULL) {
2558       KMP_FATAL(LockIsUninitialized, func);
2559     }
2560   }
2561 
2562   KMP_CHECK_USER_LOCK_INIT();
2563 
2564   if ((__kmp_user_lock_kind == lk_tas) &&
2565       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2566        OMP_NEST_LOCK_T_SIZE)) {
2567     lck = (kmp_user_lock_p)user_lock;
2568   }
2569 #if KMP_USE_FUTEX
2570   else if ((__kmp_user_lock_kind == lk_futex) &&
2571            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2572             OMP_NEST_LOCK_T_SIZE)) {
2573     lck = (kmp_user_lock_p)user_lock;
2574   }
2575 #endif
2576   else {
2577     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2578   }
2579 
2580   INIT_NESTED_LOCK(lck);
2581   __kmp_set_user_lock_location(lck, loc);
2582 
2583 #if OMPT_SUPPORT && OMPT_OPTIONAL
2584   // This is the case if called from omp_init_nest_lock:
2585   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2586   if (!codeptr)
2587     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2588   if (ompt_enabled.ompt_callback_lock_init) {
2589     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2590         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2591         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2592   }
2593 #endif
2594 
2595 #if USE_ITT_BUILD
2596   __kmp_itt_lock_creating(lck);
2597 #endif /* USE_ITT_BUILD */
2598 
2599 #endif // KMP_USE_DYNAMIC_LOCK
2600 } // __kmpc_init_nest_lock
2601 
2602 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2603 #if KMP_USE_DYNAMIC_LOCK
2604 
2605 #if USE_ITT_BUILD
2606   kmp_user_lock_p lck;
2607   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2608     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2609   } else {
2610     lck = (kmp_user_lock_p)user_lock;
2611   }
2612   __kmp_itt_lock_destroyed(lck);
2613 #endif
2614 #if OMPT_SUPPORT && OMPT_OPTIONAL
2615   // This is the case if called from omp_destroy_lock:
2616   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2617   if (!codeptr)
2618     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2619   if (ompt_enabled.ompt_callback_lock_destroy) {
2620     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2621         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2622   }
2623 #endif
2624   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2625 #else
2626   kmp_user_lock_p lck;
2627 
2628   if ((__kmp_user_lock_kind == lk_tas) &&
2629       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2630     lck = (kmp_user_lock_p)user_lock;
2631   }
2632 #if KMP_USE_FUTEX
2633   else if ((__kmp_user_lock_kind == lk_futex) &&
2634            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2635     lck = (kmp_user_lock_p)user_lock;
2636   }
2637 #endif
2638   else {
2639     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2640   }
2641 
2642 #if OMPT_SUPPORT && OMPT_OPTIONAL
2643   // This is the case if called from omp_destroy_lock:
2644   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2645   if (!codeptr)
2646     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2647   if (ompt_enabled.ompt_callback_lock_destroy) {
2648     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2649         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2650   }
2651 #endif
2652 
2653 #if USE_ITT_BUILD
2654   __kmp_itt_lock_destroyed(lck);
2655 #endif /* USE_ITT_BUILD */
2656   DESTROY_LOCK(lck);
2657 
2658   if ((__kmp_user_lock_kind == lk_tas) &&
2659       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2660     ;
2661   }
2662 #if KMP_USE_FUTEX
2663   else if ((__kmp_user_lock_kind == lk_futex) &&
2664            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2665     ;
2666   }
2667 #endif
2668   else {
2669     __kmp_user_lock_free(user_lock, gtid, lck);
2670   }
2671 #endif // KMP_USE_DYNAMIC_LOCK
2672 } // __kmpc_destroy_lock
2673 
2674 /* destroy the nested lock */
2675 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2676 #if KMP_USE_DYNAMIC_LOCK
2677 
2678 #if USE_ITT_BUILD
2679   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2680   __kmp_itt_lock_destroyed(ilk->lock);
2681 #endif
2682 #if OMPT_SUPPORT && OMPT_OPTIONAL
2683   // This is the case if called from omp_destroy_nest_lock:
2684   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2685   if (!codeptr)
2686     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2687   if (ompt_enabled.ompt_callback_lock_destroy) {
2688     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2689         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2690   }
2691 #endif
2692   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2693 
2694 #else // KMP_USE_DYNAMIC_LOCK
2695 
2696   kmp_user_lock_p lck;
2697 
2698   if ((__kmp_user_lock_kind == lk_tas) &&
2699       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2700        OMP_NEST_LOCK_T_SIZE)) {
2701     lck = (kmp_user_lock_p)user_lock;
2702   }
2703 #if KMP_USE_FUTEX
2704   else if ((__kmp_user_lock_kind == lk_futex) &&
2705            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2706             OMP_NEST_LOCK_T_SIZE)) {
2707     lck = (kmp_user_lock_p)user_lock;
2708   }
2709 #endif
2710   else {
2711     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2712   }
2713 
2714 #if OMPT_SUPPORT && OMPT_OPTIONAL
2715   // This is the case if called from omp_destroy_nest_lock:
2716   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2717   if (!codeptr)
2718     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2719   if (ompt_enabled.ompt_callback_lock_destroy) {
2720     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2721         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2722   }
2723 #endif
2724 
2725 #if USE_ITT_BUILD
2726   __kmp_itt_lock_destroyed(lck);
2727 #endif /* USE_ITT_BUILD */
2728 
2729   DESTROY_NESTED_LOCK(lck);
2730 
2731   if ((__kmp_user_lock_kind == lk_tas) &&
2732       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2733        OMP_NEST_LOCK_T_SIZE)) {
2734     ;
2735   }
2736 #if KMP_USE_FUTEX
2737   else if ((__kmp_user_lock_kind == lk_futex) &&
2738            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2739             OMP_NEST_LOCK_T_SIZE)) {
2740     ;
2741   }
2742 #endif
2743   else {
2744     __kmp_user_lock_free(user_lock, gtid, lck);
2745   }
2746 #endif // KMP_USE_DYNAMIC_LOCK
2747 } // __kmpc_destroy_nest_lock
2748 
2749 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2750   KMP_COUNT_BLOCK(OMP_set_lock);
2751 #if KMP_USE_DYNAMIC_LOCK
2752   int tag = KMP_EXTRACT_D_TAG(user_lock);
2753 #if USE_ITT_BUILD
2754   __kmp_itt_lock_acquiring(
2755       (kmp_user_lock_p)
2756           user_lock); // itt function will get to the right lock object.
2757 #endif
2758 #if OMPT_SUPPORT && OMPT_OPTIONAL
2759   // This is the case if called from omp_set_lock:
2760   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2761   if (!codeptr)
2762     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2763   if (ompt_enabled.ompt_callback_mutex_acquire) {
2764     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2765         ompt_mutex_lock, omp_lock_hint_none,
2766         __ompt_get_mutex_impl_type(user_lock),
2767         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2768   }
2769 #endif
2770 #if KMP_USE_INLINED_TAS
2771   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2772     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2773   } else
2774 #elif KMP_USE_INLINED_FUTEX
2775   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2776     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2777   } else
2778 #endif
2779   {
2780     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2781   }
2782 #if USE_ITT_BUILD
2783   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2784 #endif
2785 #if OMPT_SUPPORT && OMPT_OPTIONAL
2786   if (ompt_enabled.ompt_callback_mutex_acquired) {
2787     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2788         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2789   }
2790 #endif
2791 
2792 #else // KMP_USE_DYNAMIC_LOCK
2793 
2794   kmp_user_lock_p lck;
2795 
2796   if ((__kmp_user_lock_kind == lk_tas) &&
2797       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2798     lck = (kmp_user_lock_p)user_lock;
2799   }
2800 #if KMP_USE_FUTEX
2801   else if ((__kmp_user_lock_kind == lk_futex) &&
2802            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2803     lck = (kmp_user_lock_p)user_lock;
2804   }
2805 #endif
2806   else {
2807     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2808   }
2809 
2810 #if USE_ITT_BUILD
2811   __kmp_itt_lock_acquiring(lck);
2812 #endif /* USE_ITT_BUILD */
2813 #if OMPT_SUPPORT && OMPT_OPTIONAL
2814   // This is the case if called from omp_set_lock:
2815   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2816   if (!codeptr)
2817     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2818   if (ompt_enabled.ompt_callback_mutex_acquire) {
2819     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2820         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2821         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2822   }
2823 #endif
2824 
2825   ACQUIRE_LOCK(lck, gtid);
2826 
2827 #if USE_ITT_BUILD
2828   __kmp_itt_lock_acquired(lck);
2829 #endif /* USE_ITT_BUILD */
2830 
2831 #if OMPT_SUPPORT && OMPT_OPTIONAL
2832   if (ompt_enabled.ompt_callback_mutex_acquired) {
2833     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2834         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2835   }
2836 #endif
2837 
2838 #endif // KMP_USE_DYNAMIC_LOCK
2839 }
2840 
2841 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2842 #if KMP_USE_DYNAMIC_LOCK
2843 
2844 #if USE_ITT_BUILD
2845   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2846 #endif
2847 #if OMPT_SUPPORT && OMPT_OPTIONAL
2848   // This is the case if called from omp_set_nest_lock:
2849   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2850   if (!codeptr)
2851     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2852   if (ompt_enabled.enabled) {
2853     if (ompt_enabled.ompt_callback_mutex_acquire) {
2854       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2855           ompt_mutex_nest_lock, omp_lock_hint_none,
2856           __ompt_get_mutex_impl_type(user_lock),
2857           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2858     }
2859   }
2860 #endif
2861   int acquire_status =
2862       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2863   (void)acquire_status;
2864 #if USE_ITT_BUILD
2865   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2866 #endif
2867 
2868 #if OMPT_SUPPORT && OMPT_OPTIONAL
2869   if (ompt_enabled.enabled) {
2870     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2871       if (ompt_enabled.ompt_callback_mutex_acquired) {
2872         // lock_first
2873         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2874             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2875             codeptr);
2876       }
2877     } else {
2878       if (ompt_enabled.ompt_callback_nest_lock) {
2879         // lock_next
2880         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2881             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2882       }
2883     }
2884   }
2885 #endif
2886 
2887 #else // KMP_USE_DYNAMIC_LOCK
2888   int acquire_status;
2889   kmp_user_lock_p lck;
2890 
2891   if ((__kmp_user_lock_kind == lk_tas) &&
2892       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2893        OMP_NEST_LOCK_T_SIZE)) {
2894     lck = (kmp_user_lock_p)user_lock;
2895   }
2896 #if KMP_USE_FUTEX
2897   else if ((__kmp_user_lock_kind == lk_futex) &&
2898            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2899             OMP_NEST_LOCK_T_SIZE)) {
2900     lck = (kmp_user_lock_p)user_lock;
2901   }
2902 #endif
2903   else {
2904     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2905   }
2906 
2907 #if USE_ITT_BUILD
2908   __kmp_itt_lock_acquiring(lck);
2909 #endif /* USE_ITT_BUILD */
2910 #if OMPT_SUPPORT && OMPT_OPTIONAL
2911   // This is the case if called from omp_set_nest_lock:
2912   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2913   if (!codeptr)
2914     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2915   if (ompt_enabled.enabled) {
2916     if (ompt_enabled.ompt_callback_mutex_acquire) {
2917       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2918           ompt_mutex_nest_lock, omp_lock_hint_none,
2919           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2920           codeptr);
2921     }
2922   }
2923 #endif
2924 
2925   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2926 
2927 #if USE_ITT_BUILD
2928   __kmp_itt_lock_acquired(lck);
2929 #endif /* USE_ITT_BUILD */
2930 
2931 #if OMPT_SUPPORT && OMPT_OPTIONAL
2932   if (ompt_enabled.enabled) {
2933     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2934       if (ompt_enabled.ompt_callback_mutex_acquired) {
2935         // lock_first
2936         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2937             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2938       }
2939     } else {
2940       if (ompt_enabled.ompt_callback_nest_lock) {
2941         // lock_next
2942         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2943             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2944       }
2945     }
2946   }
2947 #endif
2948 
2949 #endif // KMP_USE_DYNAMIC_LOCK
2950 }
2951 
2952 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2953 #if KMP_USE_DYNAMIC_LOCK
2954 
2955   int tag = KMP_EXTRACT_D_TAG(user_lock);
2956 #if USE_ITT_BUILD
2957   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2958 #endif
2959 #if KMP_USE_INLINED_TAS
2960   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2961     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2962   } else
2963 #elif KMP_USE_INLINED_FUTEX
2964   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2965     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2966   } else
2967 #endif
2968   {
2969     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2970   }
2971 
2972 #if OMPT_SUPPORT && OMPT_OPTIONAL
2973   // This is the case if called from omp_unset_lock:
2974   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2975   if (!codeptr)
2976     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2977   if (ompt_enabled.ompt_callback_mutex_released) {
2978     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2979         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2980   }
2981 #endif
2982 
2983 #else // KMP_USE_DYNAMIC_LOCK
2984 
2985   kmp_user_lock_p lck;
2986 
2987   /* Can't use serial interval since not block structured */
2988   /* release the lock */
2989 
2990   if ((__kmp_user_lock_kind == lk_tas) &&
2991       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2992 #if KMP_OS_LINUX &&                                                            \
2993     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2994 // "fast" path implemented to fix customer performance issue
2995 #if USE_ITT_BUILD
2996     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2997 #endif /* USE_ITT_BUILD */
2998     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2999     KMP_MB();
3000 
3001 #if OMPT_SUPPORT && OMPT_OPTIONAL
3002     // This is the case if called from omp_unset_lock:
3003     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3004     if (!codeptr)
3005       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3006     if (ompt_enabled.ompt_callback_mutex_released) {
3007       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3008           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3009     }
3010 #endif
3011 
3012     return;
3013 #else
3014     lck = (kmp_user_lock_p)user_lock;
3015 #endif
3016   }
3017 #if KMP_USE_FUTEX
3018   else if ((__kmp_user_lock_kind == lk_futex) &&
3019            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3020     lck = (kmp_user_lock_p)user_lock;
3021   }
3022 #endif
3023   else {
3024     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
3025   }
3026 
3027 #if USE_ITT_BUILD
3028   __kmp_itt_lock_releasing(lck);
3029 #endif /* USE_ITT_BUILD */
3030 
3031   RELEASE_LOCK(lck, gtid);
3032 
3033 #if OMPT_SUPPORT && OMPT_OPTIONAL
3034   // This is the case, if called from omp_init_lock_with_hint:
3035   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3036   if (!codeptr)
3037     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3038   if (ompt_enabled.ompt_callback_mutex_released) {
3039     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3040         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3041   }
3042 #endif
3043 
3044 #endif // KMP_USE_DYNAMIC_LOCK
3045 }
3046 
3047 /* release the lock */
3048 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3049 #if KMP_USE_DYNAMIC_LOCK
3050 
3051 #if USE_ITT_BUILD
3052   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3053 #endif
3054   int release_status =
3055       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
3056   (void)release_status;
3057 
3058 #if OMPT_SUPPORT && OMPT_OPTIONAL
3059   // This is the case, if called from omp_init_lock_with_hint:
3060   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3061   if (!codeptr)
3062     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3063   if (ompt_enabled.enabled) {
3064     if (release_status == KMP_LOCK_RELEASED) {
3065       if (ompt_enabled.ompt_callback_mutex_released) {
3066         // release_lock_last
3067         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3068             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3069             codeptr);
3070       }
3071     } else if (ompt_enabled.ompt_callback_nest_lock) {
3072       // release_lock_prev
3073       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3074           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3075     }
3076   }
3077 #endif
3078 
3079 #else // KMP_USE_DYNAMIC_LOCK
3080 
3081   kmp_user_lock_p lck;
3082 
3083   /* Can't use serial interval since not block structured */
3084 
3085   if ((__kmp_user_lock_kind == lk_tas) &&
3086       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3087        OMP_NEST_LOCK_T_SIZE)) {
3088 #if KMP_OS_LINUX &&                                                            \
3089     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
3090     // "fast" path implemented to fix customer performance issue
3091     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3092 #if USE_ITT_BUILD
3093     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3094 #endif /* USE_ITT_BUILD */
3095 
3096 #if OMPT_SUPPORT && OMPT_OPTIONAL
3097     int release_status = KMP_LOCK_STILL_HELD;
3098 #endif
3099 
3100     if (--(tl->lk.depth_locked) == 0) {
3101       TCW_4(tl->lk.poll, 0);
3102 #if OMPT_SUPPORT && OMPT_OPTIONAL
3103       release_status = KMP_LOCK_RELEASED;
3104 #endif
3105     }
3106     KMP_MB();
3107 
3108 #if OMPT_SUPPORT && OMPT_OPTIONAL
3109     // This is the case, if called from omp_init_lock_with_hint:
3110     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3111     if (!codeptr)
3112       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3113     if (ompt_enabled.enabled) {
3114       if (release_status == KMP_LOCK_RELEASED) {
3115         if (ompt_enabled.ompt_callback_mutex_released) {
3116           // release_lock_last
3117           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3118               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3119         }
3120       } else if (ompt_enabled.ompt_callback_nest_lock) {
3121         // release_lock_previous
3122         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3123             ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3124       }
3125     }
3126 #endif
3127 
3128     return;
3129 #else
3130     lck = (kmp_user_lock_p)user_lock;
3131 #endif
3132   }
3133 #if KMP_USE_FUTEX
3134   else if ((__kmp_user_lock_kind == lk_futex) &&
3135            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3136             OMP_NEST_LOCK_T_SIZE)) {
3137     lck = (kmp_user_lock_p)user_lock;
3138   }
3139 #endif
3140   else {
3141     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3142   }
3143 
3144 #if USE_ITT_BUILD
3145   __kmp_itt_lock_releasing(lck);
3146 #endif /* USE_ITT_BUILD */
3147 
3148   int release_status;
3149   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3150 #if OMPT_SUPPORT && OMPT_OPTIONAL
3151   // This is the case, if called from omp_init_lock_with_hint:
3152   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3153   if (!codeptr)
3154     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3155   if (ompt_enabled.enabled) {
3156     if (release_status == KMP_LOCK_RELEASED) {
3157       if (ompt_enabled.ompt_callback_mutex_released) {
3158         // release_lock_last
3159         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3160             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3161       }
3162     } else if (ompt_enabled.ompt_callback_nest_lock) {
3163       // release_lock_previous
3164       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3165           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3166     }
3167   }
3168 #endif
3169 
3170 #endif // KMP_USE_DYNAMIC_LOCK
3171 }
3172 
3173 /* try to acquire the lock */
3174 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3175   KMP_COUNT_BLOCK(OMP_test_lock);
3176 
3177 #if KMP_USE_DYNAMIC_LOCK
3178   int rc;
3179   int tag = KMP_EXTRACT_D_TAG(user_lock);
3180 #if USE_ITT_BUILD
3181   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3182 #endif
3183 #if OMPT_SUPPORT && OMPT_OPTIONAL
3184   // This is the case, if called from omp_init_lock_with_hint:
3185   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3186   if (!codeptr)
3187     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3188   if (ompt_enabled.ompt_callback_mutex_acquire) {
3189     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3190         ompt_mutex_test_lock, omp_lock_hint_none,
3191         __ompt_get_mutex_impl_type(user_lock),
3192         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3193   }
3194 #endif
3195 #if KMP_USE_INLINED_TAS
3196   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3197     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3198   } else
3199 #elif KMP_USE_INLINED_FUTEX
3200   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3201     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3202   } else
3203 #endif
3204   {
3205     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3206   }
3207   if (rc) {
3208 #if USE_ITT_BUILD
3209     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3210 #endif
3211 #if OMPT_SUPPORT && OMPT_OPTIONAL
3212     if (ompt_enabled.ompt_callback_mutex_acquired) {
3213       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3214           ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3215     }
3216 #endif
3217     return FTN_TRUE;
3218   } else {
3219 #if USE_ITT_BUILD
3220     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3221 #endif
3222     return FTN_FALSE;
3223   }
3224 
3225 #else // KMP_USE_DYNAMIC_LOCK
3226 
3227   kmp_user_lock_p lck;
3228   int rc;
3229 
3230   if ((__kmp_user_lock_kind == lk_tas) &&
3231       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3232     lck = (kmp_user_lock_p)user_lock;
3233   }
3234 #if KMP_USE_FUTEX
3235   else if ((__kmp_user_lock_kind == lk_futex) &&
3236            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3237     lck = (kmp_user_lock_p)user_lock;
3238   }
3239 #endif
3240   else {
3241     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3242   }
3243 
3244 #if USE_ITT_BUILD
3245   __kmp_itt_lock_acquiring(lck);
3246 #endif /* USE_ITT_BUILD */
3247 #if OMPT_SUPPORT && OMPT_OPTIONAL
3248   // This is the case, if called from omp_init_lock_with_hint:
3249   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3250   if (!codeptr)
3251     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3252   if (ompt_enabled.ompt_callback_mutex_acquire) {
3253     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3254         ompt_mutex_test_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3255         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3256   }
3257 #endif
3258 
3259   rc = TEST_LOCK(lck, gtid);
3260 #if USE_ITT_BUILD
3261   if (rc) {
3262     __kmp_itt_lock_acquired(lck);
3263   } else {
3264     __kmp_itt_lock_cancelled(lck);
3265   }
3266 #endif /* USE_ITT_BUILD */
3267 #if OMPT_SUPPORT && OMPT_OPTIONAL
3268   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3269     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3270         ompt_mutex_test_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3271   }
3272 #endif
3273 
3274   return (rc ? FTN_TRUE : FTN_FALSE);
3275 
3276   /* Can't use serial interval since not block structured */
3277 
3278 #endif // KMP_USE_DYNAMIC_LOCK
3279 }
3280 
3281 /* try to acquire the lock */
3282 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3283 #if KMP_USE_DYNAMIC_LOCK
3284   int rc;
3285 #if USE_ITT_BUILD
3286   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3287 #endif
3288 #if OMPT_SUPPORT && OMPT_OPTIONAL
3289   // This is the case, if called from omp_init_lock_with_hint:
3290   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3291   if (!codeptr)
3292     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3293   if (ompt_enabled.ompt_callback_mutex_acquire) {
3294     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3295         ompt_mutex_test_nest_lock, omp_lock_hint_none,
3296         __ompt_get_mutex_impl_type(user_lock),
3297         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3298   }
3299 #endif
3300   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3301 #if USE_ITT_BUILD
3302   if (rc) {
3303     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3304   } else {
3305     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3306   }
3307 #endif
3308 #if OMPT_SUPPORT && OMPT_OPTIONAL
3309   if (ompt_enabled.enabled && rc) {
3310     if (rc == 1) {
3311       if (ompt_enabled.ompt_callback_mutex_acquired) {
3312         // lock_first
3313         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3314             ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3315             codeptr);
3316       }
3317     } else {
3318       if (ompt_enabled.ompt_callback_nest_lock) {
3319         // lock_next
3320         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3321             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3322       }
3323     }
3324   }
3325 #endif
3326   return rc;
3327 
3328 #else // KMP_USE_DYNAMIC_LOCK
3329 
3330   kmp_user_lock_p lck;
3331   int rc;
3332 
3333   if ((__kmp_user_lock_kind == lk_tas) &&
3334       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3335        OMP_NEST_LOCK_T_SIZE)) {
3336     lck = (kmp_user_lock_p)user_lock;
3337   }
3338 #if KMP_USE_FUTEX
3339   else if ((__kmp_user_lock_kind == lk_futex) &&
3340            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3341             OMP_NEST_LOCK_T_SIZE)) {
3342     lck = (kmp_user_lock_p)user_lock;
3343   }
3344 #endif
3345   else {
3346     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3347   }
3348 
3349 #if USE_ITT_BUILD
3350   __kmp_itt_lock_acquiring(lck);
3351 #endif /* USE_ITT_BUILD */
3352 
3353 #if OMPT_SUPPORT && OMPT_OPTIONAL
3354   // This is the case, if called from omp_init_lock_with_hint:
3355   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3356   if (!codeptr)
3357     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3358   if (ompt_enabled.enabled &&
3359       ompt_enabled.ompt_callback_mutex_acquire) {
3360     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3361         ompt_mutex_test_nest_lock, omp_lock_hint_none,
3362         __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3363         codeptr);
3364   }
3365 #endif
3366 
3367   rc = TEST_NESTED_LOCK(lck, gtid);
3368 #if USE_ITT_BUILD
3369   if (rc) {
3370     __kmp_itt_lock_acquired(lck);
3371   } else {
3372     __kmp_itt_lock_cancelled(lck);
3373   }
3374 #endif /* USE_ITT_BUILD */
3375 #if OMPT_SUPPORT && OMPT_OPTIONAL
3376   if (ompt_enabled.enabled && rc) {
3377     if (rc == 1) {
3378       if (ompt_enabled.ompt_callback_mutex_acquired) {
3379         // lock_first
3380         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3381             ompt_mutex_test_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3382       }
3383     } else {
3384       if (ompt_enabled.ompt_callback_nest_lock) {
3385         // lock_next
3386         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3387             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3388       }
3389     }
3390   }
3391 #endif
3392   return rc;
3393 
3394   /* Can't use serial interval since not block structured */
3395 
3396 #endif // KMP_USE_DYNAMIC_LOCK
3397 }
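
/*
  Usage sketch from the user's side (illustrative only): omp_test_nest_lock()
  reaches this entry point and, per the OpenMP API, returns the new nesting
  count on success and zero on failure, which is why the OMPT code above
  treats rc == 1 as the first acquisition:

    omp_nest_lock_t l;
    omp_init_nest_lock(&l);
    if (omp_test_nest_lock(&l)) {   // returns 1: first acquisition
      if (omp_test_nest_lock(&l))   // returns 2: nested re-acquisition by owner
        omp_unset_nest_lock(&l);
      omp_unset_nest_lock(&l);
    }
    omp_destroy_nest_lock(&l);
*/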
3398 
3399 // Interface to fast scalable reduce methods routines
3400 
3401 // Keep the selected method in a thread-local structure for cross-function
3402 // use; it will be read again in the __kmpc_end_reduce* functions.
3403 // Another solution would be to re-determine the method one more time in the
3404 // __kmpc_end_reduce* functions (a new prototype would be required then).
3405 // AT: which solution is better?
3406 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3407   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3408 
3409 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3410   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3411 
3412 // description of the packed_reduction_method variable: look at the macros in
3413 // kmp.h
3414 
3415 // used in a critical section reduce block
3416 static __forceinline void
3417 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3418                                           kmp_critical_name *crit) {
3419 
3420   // This lock was visible to a customer and to the threading profile tool as
3421   // a serial overhead span (although it is used for an internal purpose only).
3422   //            Why was it visible in the previous implementation?
3423   //            Should we keep it visible in the new reduce block?
3424   kmp_user_lock_p lck;
3425 
3426 #if KMP_USE_DYNAMIC_LOCK
3427 
3428   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3429   // Check if it is initialized.
3430   if (*lk == 0) {
3431     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3432       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3433                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3434     } else {
3435       __kmp_init_indirect_csptr(crit, loc, global_tid,
3436                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3437     }
3438   }
3439   // Branch for accessing the actual lock object and set operation. This
3440   // branching is inevitable since this lock initialization does not follow the
3441   // normal dispatch path (lock table is not used).
3442   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3443     lck = (kmp_user_lock_p)lk;
3444     KMP_DEBUG_ASSERT(lck != NULL);
3445     if (__kmp_env_consistency_check) {
3446       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3447     }
3448     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3449   } else {
3450     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3451     lck = ilk->lock;
3452     KMP_DEBUG_ASSERT(lck != NULL);
3453     if (__kmp_env_consistency_check) {
3454       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3455     }
3456     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3457   }
3458 
3459 #else // KMP_USE_DYNAMIC_LOCK
3460 
3461   // We know that the fast reduction code is only emitted by Intel compilers
3462   // with 32 byte critical sections. If there isn't enough space, then we
3463   // have to use a pointer.
3464   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3465     lck = (kmp_user_lock_p)crit;
3466   } else {
3467     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3468   }
3469   KMP_DEBUG_ASSERT(lck != NULL);
3470 
3471   if (__kmp_env_consistency_check)
3472     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3473 
3474   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3475 
3476 #endif // KMP_USE_DYNAMIC_LOCK
3477 }
3478 
3479 // used in a critical section reduce block
3480 static __forceinline void
3481 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3482                                         kmp_critical_name *crit) {
3483 
3484   kmp_user_lock_p lck;
3485 
3486 #if KMP_USE_DYNAMIC_LOCK
3487 
3488   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3489     lck = (kmp_user_lock_p)crit;
3490     if (__kmp_env_consistency_check)
3491       __kmp_pop_sync(global_tid, ct_critical, loc);
3492     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3493   } else {
3494     kmp_indirect_lock_t *ilk =
3495         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3496     if (__kmp_env_consistency_check)
3497       __kmp_pop_sync(global_tid, ct_critical, loc);
3498     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3499   }
3500 
3501 #else // KMP_USE_DYNAMIC_LOCK
3502 
3503   // We know that the fast reduction code is only emitted by Intel compilers
3504   // with 32 byte critical sections. If there isn't enough space, then we have
3505   // to use a pointer.
3506   if (__kmp_base_user_lock_size > 32) {
3507     lck = *((kmp_user_lock_p *)crit);
3508     KMP_ASSERT(lck != NULL);
3509   } else {
3510     lck = (kmp_user_lock_p)crit;
3511   }
3512 
3513   if (__kmp_env_consistency_check)
3514     __kmp_pop_sync(global_tid, ct_critical, loc);
3515 
3516   __kmp_release_user_lock_with_checks(lck, global_tid);
3517 
3518 #endif // KMP_USE_DYNAMIC_LOCK
3519 } // __kmp_end_critical_section_reduce_block
3520 
3521 static __forceinline int
3522 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3523                                      int *task_state) {
3524   kmp_team_t *team;
3525 
3526   // Check whether we are inside a teams construct.
3527   if (th->th.th_teams_microtask) {
3528     *team_p = team = th->th.th_team;
3529     if (team->t.t_level == th->th.th_teams_level) {
3530       // This is reduction at teams construct.
3531       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3532       // Let's swap teams temporarily for the reduction.
3533       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3534       th->th.th_team = team->t.t_parent;
3535       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3536       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3537       *task_state = th->th.th_task_state;
3538       th->th.th_task_state = 0;
3539 
3540       return 1;
3541     }
3542   }
3543   return 0;
3544 }
3545 
3546 static __forceinline void
3547 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3548   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3549   th->th.th_info.ds.ds_tid = 0;
3550   th->th.th_team = team;
3551   th->th.th_team_nproc = team->t.t_nproc;
3552   th->th.th_task_team = team->t.t_task_team[task_state];
3553   __kmp_type_convert(task_state, &(th->th.th_task_state));
3554 }
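
/*
  Usage sketch (mirrors how the reduce entry points below use this pair;
  illustrative only):

    teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
    // ... perform the reduction against the parent (league-level) team ...
    if (teams_swapped)
      __kmp_restore_swapped_teams(th, team, task_state);
*/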
3555 
3556 /* 2.a.i. Reduce Block without a terminating barrier */
3557 /*!
3558 @ingroup SYNCHRONIZATION
3559 @param loc source location information
3560 @param global_tid global thread number
3561 @param num_vars number of items (variables) to be reduced
3562 @param reduce_size size of data in bytes to be reduced
3563 @param reduce_data pointer to data to be reduced
3564 @param reduce_func callback function providing reduction operation on two
3565 operands and returning result of reduction in lhs_data
3566 @param lck pointer to the unique lock data structure
3567 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3568 threads if atomic reduction needed
3569 
3570 The nowait version is used for a reduce clause with the nowait argument.
3571 */
3572 kmp_int32
3573 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3574                      size_t reduce_size, void *reduce_data,
3575                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3576                      kmp_critical_name *lck) {
3577 
3578   KMP_COUNT_BLOCK(REDUCE_nowait);
3579   int retval = 0;
3580   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3581   kmp_info_t *th;
3582   kmp_team_t *team;
3583   int teams_swapped = 0, task_state;
3584   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3585   __kmp_assert_valid_gtid(global_tid);
3586 
3587   // Why do we need this initialization here at all?
3588   // A reduction clause cannot be used as a stand-alone directive.
3589 
3590   // Do not call __kmp_serial_initialize(); it will be called by
3591   // __kmp_parallel_initialize() if needed.
3592   // Possible detection of a false-positive race by the thread checker ???
3593   if (!TCR_4(__kmp_init_parallel))
3594     __kmp_parallel_initialize();
3595 
3596   __kmp_resume_if_soft_paused();
3597 
3598 // check correctness of reduce block nesting
3599 #if KMP_USE_DYNAMIC_LOCK
3600   if (__kmp_env_consistency_check)
3601     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3602 #else
3603   if (__kmp_env_consistency_check)
3604     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3605 #endif
3606 
3607   th = __kmp_thread_from_gtid(global_tid);
3608   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3609 
3610   // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
3611   // functions, so it must be kept in a variable.
3612   // That variable should be a construct-specific or thread-specific property,
3613   // not a team-specific property
3614   //     (a thread can reach the next reduce block on the next construct, and
3615   //     the reduction method may differ on that construct).
3616   // The ident_t "loc" parameter could serve as a construct-specific property
3617   // (but what if loc == 0?)
3618   //     (and if both construct-specific and team-specific variables were
3619   //     shared, unnecessary extra synchronization would be needed).
3620   // A thread-specific variable handles both issues above (next construct and
3621   // extra syncs) better,
3622   // so a thread-specific "th_local.reduction_method" variable is used currently.
3623   // Each thread executes the 'determine' and 'set' lines; there is no need to
3624   // restrict this to one thread, which would only add unnecessary syncs.
3625 
3626   packed_reduction_method = __kmp_determine_reduction_method(
3627       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3628   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3629 
3630   OMPT_REDUCTION_DECL(th, global_tid);
3631   if (packed_reduction_method == critical_reduce_block) {
3632 
3633     OMPT_REDUCTION_BEGIN;
3634 
3635     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3636     retval = 1;
3637 
3638   } else if (packed_reduction_method == empty_reduce_block) {
3639 
3640     OMPT_REDUCTION_BEGIN;
3641 
3642     // usage: if team size == 1, no synchronization is required ( Intel
3643     // platforms only )
3644     retval = 1;
3645 
3646   } else if (packed_reduction_method == atomic_reduce_block) {
3647 
3648     retval = 2;
3649 
3650     // All threads should do this pop here (because __kmpc_end_reduce_nowait()
3651     // won't be called by the code gen).
3652     //     (This is not ideal: the checking block has already been closed by
3653     //     this 'pop',
3654     //      but the atomic operation has not been executed yet; it will execute
3655     //      slightly later, literally on the next instruction.)
3656     if (__kmp_env_consistency_check)
3657       __kmp_pop_sync(global_tid, ct_reduce, loc);
3658 
3659   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3660                                    tree_reduce_block)) {
3661 
3662 // AT: performance issue: a real barrier here
3663 // AT: (if the primary thread is slow, other threads are blocked here waiting
3664 //      for the primary thread to come and release them)
3665 // AT: (this is not what a customer expects when specifying the NOWAIT clause)
3666 // AT: (specifying NOWAIT won't improve performance here, which will be
3667 //      confusing to a customer)
3668 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3669 // might be faster and more in line with the intent of NOWAIT
3670 // AT: TODO: run the EPCC benchmark and compare times
3671 
3672 // this barrier should be invisible to a customer and to the threading profile
3673 // tool (it's neither a terminating barrier nor customer's code, it's
3674 // used for an internal purpose)
3675 #if OMPT_SUPPORT
3676     // JP: can this barrier potentially lead to task scheduling?
3677     // JP: as long as there is a barrier in the implementation, OMPT should and
3678     // will provide the barrier events,
3679     //         so we set up the necessary frame/return addresses.
3680     ompt_frame_t *ompt_frame;
3681     if (ompt_enabled.enabled) {
3682       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3683       if (ompt_frame->enter_frame.ptr == NULL)
3684         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3685     }
3686     OMPT_STORE_RETURN_ADDRESS(global_tid);
3687 #endif
3688 #if USE_ITT_NOTIFY
3689     __kmp_threads[global_tid]->th.th_ident = loc;
3690 #endif
3691     retval =
3692         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3693                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3694     retval = (retval != 0) ? (0) : (1);
3695 #if OMPT_SUPPORT && OMPT_OPTIONAL
3696     if (ompt_enabled.enabled) {
3697       ompt_frame->enter_frame = ompt_data_none;
3698     }
3699 #endif
3700 
3701     // all other workers except primary thread should do this pop here
3702     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3703     if (__kmp_env_consistency_check) {
3704       if (retval == 0) {
3705         __kmp_pop_sync(global_tid, ct_reduce, loc);
3706       }
3707     }
3708 
3709   } else {
3710 
3711     // should never reach this block
3712     KMP_ASSERT(0); // "unexpected method"
3713   }
3714   if (teams_swapped) {
3715     __kmp_restore_swapped_teams(th, team, task_state);
3716   }
3717   KA_TRACE(
3718       10,
3719       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3720        global_tid, packed_reduction_method, retval));
3721 
3722   return retval;
3723 }
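
/*
  Illustrative sketch (an assumption about typical compiler code generation,
  not a definitive contract) of how these entry points are driven for a
  "reduction(+ : sum)" clause with nowait. The names loc, gtid, crit,
  reduce_fn, priv_sum and shared_sum are placeholders for this example only;
  in real code generation reduce_data is usually a struct of pointers to the
  private copies. Per the comments above, the atomic path (return value 2)
  does not call __kmpc_end_reduce_nowait():

    switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(priv_sum), &priv_sum,
                                 reduce_fn, &crit)) {
    case 1: // combine the privately computed value, then close the region
      shared_sum += priv_sum;
      __kmpc_end_reduce_nowait(loc, gtid, &crit);
      break;
    case 2: // combine with an atomic update; no end call is generated
      // ... atomic update of shared_sum by priv_sum ...
      break;
    default: // 0: this thread's contribution was already consumed
      break;
    }
*/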
3724 
3725 /*!
3726 @ingroup SYNCHRONIZATION
3727 @param loc source location information
3728 @param global_tid global thread id.
3729 @param lck pointer to the unique lock data structure
3730 
3731 Finish the execution of a reduce nowait.
3732 */
3733 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3734                               kmp_critical_name *lck) {
3735 
3736   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3737 
3738   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3739   __kmp_assert_valid_gtid(global_tid);
3740 
3741   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3742 
3743   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3744 
3745   if (packed_reduction_method == critical_reduce_block) {
3746 
3747     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3748     OMPT_REDUCTION_END;
3749 
3750   } else if (packed_reduction_method == empty_reduce_block) {
3751 
3752     // usage: if team size == 1, no synchronization is required ( on Intel
3753     // platforms only )
3754 
3755     OMPT_REDUCTION_END;
3756 
3757   } else if (packed_reduction_method == atomic_reduce_block) {
3758 
3759     // Neither the primary thread nor other workers should get here
3760     //     (code gen does not generate this call in case 2: atomic reduce block).
3761     // Actually it would be better to remove this else-if entirely; after its
3762     // removal this value would be caught by the 'else' branch and would assert.
3763 
3764   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3765                                    tree_reduce_block)) {
3766 
3767     // only primary thread gets here
3768     // OMPT: tree reduction is annotated in the barrier code
3769 
3770   } else {
3771 
3772     // should never reach this block
3773     KMP_ASSERT(0); // "unexpected method"
3774   }
3775 
3776   if (__kmp_env_consistency_check)
3777     __kmp_pop_sync(global_tid, ct_reduce, loc);
3778 
3779   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3780                 global_tid, packed_reduction_method));
3781 
3782   return;
3783 }
3784 
3785 /* 2.a.ii. Reduce Block with a terminating barrier */
3786 
3787 /*!
3788 @ingroup SYNCHRONIZATION
3789 @param loc source location information
3790 @param global_tid global thread number
3791 @param num_vars number of items (variables) to be reduced
3792 @param reduce_size size of data in bytes to be reduced
3793 @param reduce_data pointer to data to be reduced
3794 @param reduce_func callback function providing reduction operation on two
3795 operands and returning result of reduction in lhs_data
3796 @param lck pointer to the unique lock data structure
3797 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3798 threads if atomic reduction needed
3799 
3800 A blocking reduce that includes an implicit barrier.
3801 */
3802 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3803                         size_t reduce_size, void *reduce_data,
3804                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3805                         kmp_critical_name *lck) {
3806   KMP_COUNT_BLOCK(REDUCE_wait);
3807   int retval = 0;
3808   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3809   kmp_info_t *th;
3810   kmp_team_t *team;
3811   int teams_swapped = 0, task_state;
3812 
3813   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3814   __kmp_assert_valid_gtid(global_tid);
3815 
3816   // Why do we need this initialization here at all?
3817   // A reduction clause cannot be a stand-alone directive.
3818 
3819   // Do not call __kmp_serial_initialize(); it will be called by
3820   // __kmp_parallel_initialize() if needed.
3821   // Possible detection of a false-positive race by the thread checker ???
3822   if (!TCR_4(__kmp_init_parallel))
3823     __kmp_parallel_initialize();
3824 
3825   __kmp_resume_if_soft_paused();
3826 
3827 // check correctness of reduce block nesting
3828 #if KMP_USE_DYNAMIC_LOCK
3829   if (__kmp_env_consistency_check)
3830     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3831 #else
3832   if (__kmp_env_consistency_check)
3833     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3834 #endif
3835 
3836   th = __kmp_thread_from_gtid(global_tid);
3837   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3838 
3839   packed_reduction_method = __kmp_determine_reduction_method(
3840       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3841   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3842 
3843   OMPT_REDUCTION_DECL(th, global_tid);
3844 
3845   if (packed_reduction_method == critical_reduce_block) {
3846 
3847     OMPT_REDUCTION_BEGIN;
3848     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3849     retval = 1;
3850 
3851   } else if (packed_reduction_method == empty_reduce_block) {
3852 
3853     OMPT_REDUCTION_BEGIN;
3854     // usage: if team size == 1, no synchronization is required ( Intel
3855     // platforms only )
3856     retval = 1;
3857 
3858   } else if (packed_reduction_method == atomic_reduce_block) {
3859 
3860     retval = 2;
3861 
3862   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3863                                    tree_reduce_block)) {
3864 
3865 // case tree_reduce_block:
3866 // this barrier should be visible to a customer and to the threading profile
3867 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3868 #if OMPT_SUPPORT
3869     ompt_frame_t *ompt_frame;
3870     if (ompt_enabled.enabled) {
3871       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3872       if (ompt_frame->enter_frame.ptr == NULL)
3873         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3874     }
3875     OMPT_STORE_RETURN_ADDRESS(global_tid);
3876 #endif
3877 #if USE_ITT_NOTIFY
3878     __kmp_threads[global_tid]->th.th_ident =
3879         loc; // needed for correct notification of frames
3880 #endif
3881     retval =
3882         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3883                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3884     retval = (retval != 0) ? (0) : (1);
3885 #if OMPT_SUPPORT && OMPT_OPTIONAL
3886     if (ompt_enabled.enabled) {
3887       ompt_frame->enter_frame = ompt_data_none;
3888     }
3889 #endif
3890 
3891     // all other workers except primary thread should do this pop here
3892     // (none of other workers except primary will enter __kmpc_end_reduce())
3893     if (__kmp_env_consistency_check) {
3894       if (retval == 0) { // 0: all other workers; 1: primary thread
3895         __kmp_pop_sync(global_tid, ct_reduce, loc);
3896       }
3897     }
3898 
3899   } else {
3900 
3901     // should never reach this block
3902     KMP_ASSERT(0); // "unexpected method"
3903   }
3904   if (teams_swapped) {
3905     __kmp_restore_swapped_teams(th, team, task_state);
3906   }
3907 
3908   KA_TRACE(10,
3909            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3910             global_tid, packed_reduction_method, retval));
3911   return retval;
3912 }
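
/*
  Illustrative sketch (an assumption about typical code generation, not a
  definitive contract): the blocking form pairs with __kmpc_end_reduce().
  Unlike the nowait variant, __kmpc_end_reduce() handles the atomic case as
  well (it performs the terminating barrier), so the end call is expected on
  both the returned-1 and returned-2 paths; names are placeholders:

    int ret = __kmpc_reduce(loc, gtid, 1, sizeof(priv_sum), &priv_sum,
                            reduce_fn, &crit);
    if (ret == 1) {
      shared_sum += priv_sum; // combine under the critical section / after tree
      __kmpc_end_reduce(loc, gtid, &crit);
    } else if (ret == 2) {
      // ... atomic update of shared_sum by priv_sum ...
      __kmpc_end_reduce(loc, gtid, &crit);
    }
*/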
3913 
3914 /*!
3915 @ingroup SYNCHRONIZATION
3916 @param loc source location information
3917 @param global_tid global thread id.
3918 @param lck pointer to the unique lock data structure
3919 
3920 Finish the execution of a blocking reduce.
3921 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3922 start function.
3923 */
3924 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3925                        kmp_critical_name *lck) {
3926 
3927   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3928   kmp_info_t *th;
3929   kmp_team_t *team;
3930   int teams_swapped = 0, task_state;
3931 
3932   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3933   __kmp_assert_valid_gtid(global_tid);
3934 
3935   th = __kmp_thread_from_gtid(global_tid);
3936   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3937 
3938   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3939 
3940   // this barrier should be visible to a customer and to the threading profile
3941   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3942   OMPT_REDUCTION_DECL(th, global_tid);
3943 
3944   if (packed_reduction_method == critical_reduce_block) {
3945     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3946 
3947     OMPT_REDUCTION_END;
3948 
3949 // TODO: implicit barrier: should be exposed
3950 #if OMPT_SUPPORT
3951     ompt_frame_t *ompt_frame;
3952     if (ompt_enabled.enabled) {
3953       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3954       if (ompt_frame->enter_frame.ptr == NULL)
3955         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3956     }
3957     OMPT_STORE_RETURN_ADDRESS(global_tid);
3958 #endif
3959 #if USE_ITT_NOTIFY
3960     __kmp_threads[global_tid]->th.th_ident = loc;
3961 #endif
3962     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3963 #if OMPT_SUPPORT && OMPT_OPTIONAL
3964     if (ompt_enabled.enabled) {
3965       ompt_frame->enter_frame = ompt_data_none;
3966     }
3967 #endif
3968 
3969   } else if (packed_reduction_method == empty_reduce_block) {
3970 
3971     OMPT_REDUCTION_END;
3972 
3973 // usage: if team size==1, no synchronization is required (Intel platforms only)
3974 
3975 // TODO: implicit barrier: should be exposed
3976 #if OMPT_SUPPORT
3977     ompt_frame_t *ompt_frame;
3978     if (ompt_enabled.enabled) {
3979       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3980       if (ompt_frame->enter_frame.ptr == NULL)
3981         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3982     }
3983     OMPT_STORE_RETURN_ADDRESS(global_tid);
3984 #endif
3985 #if USE_ITT_NOTIFY
3986     __kmp_threads[global_tid]->th.th_ident = loc;
3987 #endif
3988     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3989 #if OMPT_SUPPORT && OMPT_OPTIONAL
3990     if (ompt_enabled.enabled) {
3991       ompt_frame->enter_frame = ompt_data_none;
3992     }
3993 #endif
3994 
3995   } else if (packed_reduction_method == atomic_reduce_block) {
3996 
3997 #if OMPT_SUPPORT
3998     ompt_frame_t *ompt_frame;
3999     if (ompt_enabled.enabled) {
4000       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
4001       if (ompt_frame->enter_frame.ptr == NULL)
4002         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
4003     }
4004     OMPT_STORE_RETURN_ADDRESS(global_tid);
4005 #endif
4006 // TODO: implicit barrier: should be exposed
4007 #if USE_ITT_NOTIFY
4008     __kmp_threads[global_tid]->th.th_ident = loc;
4009 #endif
4010     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
4011 #if OMPT_SUPPORT && OMPT_OPTIONAL
4012     if (ompt_enabled.enabled) {
4013       ompt_frame->enter_frame = ompt_data_none;
4014     }
4015 #endif
4016 
4017   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
4018                                    tree_reduce_block)) {
4019 
4020     // only primary thread executes here (primary releases all other workers)
4021     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
4022                             global_tid);
4023 
4024   } else {
4025 
4026     // should never reach this block
4027     KMP_ASSERT(0); // "unexpected method"
4028   }
4029   if (teams_swapped) {
4030     __kmp_restore_swapped_teams(th, team, task_state);
4031   }
4032 
4033   if (__kmp_env_consistency_check)
4034     __kmp_pop_sync(global_tid, ct_reduce, loc);
4035 
4036   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
4037                 global_tid, packed_reduction_method));
4038 
4039   return;
4040 }
4041 
4042 #undef __KMP_GET_REDUCTION_METHOD
4043 #undef __KMP_SET_REDUCTION_METHOD
4044 
4045 /* end of interface to fast scalable reduce routines */
4046 
4047 kmp_uint64 __kmpc_get_taskid() {
4048 
4049   kmp_int32 gtid;
4050   kmp_info_t *thread;
4051 
4052   gtid = __kmp_get_gtid();
4053   if (gtid < 0) {
4054     return 0;
4055   }
4056   thread = __kmp_thread_from_gtid(gtid);
4057   return thread->th.th_current_task->td_task_id;
4058 
4059 } // __kmpc_get_taskid
4060 
4061 kmp_uint64 __kmpc_get_parent_taskid() {
4062 
4063   kmp_int32 gtid;
4064   kmp_info_t *thread;
4065   kmp_taskdata_t *parent_task;
4066 
4067   gtid = __kmp_get_gtid();
4068   if (gtid < 0) {
4069     return 0;
4070   }
4071   thread = __kmp_thread_from_gtid(gtid);
4072   parent_task = thread->th.th_current_task->td_parent;
4073   return (parent_task == NULL ? 0 : parent_task->td_task_id);
4074 
4075 } // __kmpc_get_parent_taskid
4076 
4077 /*!
4078 @ingroup WORK_SHARING
4079 @param loc  source location information.
4080 @param gtid  global thread number.
4081 @param num_dims  number of associated doacross loops.
4082 @param dims  info on loops bounds.
4083 
4084 Initialize doacross loop information.
4085 Expect the compiler to send us inclusive bounds,
4086 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
4087 */
4088 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
4089                           const struct kmp_dim *dims) {
4090   __kmp_assert_valid_gtid(gtid);
4091   int j, idx;
4092   kmp_int64 last, trace_count;
4093   kmp_info_t *th = __kmp_threads[gtid];
4094   kmp_team_t *team = th->th.th_team;
4095   kmp_uint32 *flags;
4096   kmp_disp_t *pr_buf = th->th.th_dispatch;
4097   dispatch_shared_info_t *sh_buf;
4098 
4099   KA_TRACE(
4100       20,
4101       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4102        gtid, num_dims, !team->t.t_serialized));
4103   KMP_DEBUG_ASSERT(dims != NULL);
4104   KMP_DEBUG_ASSERT(num_dims > 0);
4105 
4106   if (team->t.t_serialized) {
4107     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4108     return; // no dependencies if team is serialized
4109   }
4110   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4111   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4112   // the next loop
4113   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4114 
4115   // Save bounds info into allocated private buffer
4116   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4117   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4118       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4119   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4120   pr_buf->th_doacross_info[0] =
4121       (kmp_int64)num_dims; // first element is number of dimensions
4122   // Save also address of num_done in order to access it later without knowing
4123   // the buffer index
4124   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4125   pr_buf->th_doacross_info[2] = dims[0].lo;
4126   pr_buf->th_doacross_info[3] = dims[0].up;
4127   pr_buf->th_doacross_info[4] = dims[0].st;
4128   last = 5;
4129   for (j = 1; j < num_dims; ++j) {
4130     kmp_int64
4131         range_length; // To keep ranges of all dimensions but the first dims[0]
4132     if (dims[j].st == 1) { // most common case
4133       // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
4134       range_length = dims[j].up - dims[j].lo + 1;
4135     } else {
4136       if (dims[j].st > 0) {
4137         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4138         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4139       } else { // negative increment
4140         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4141         range_length =
4142             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4143       }
4144     }
4145     pr_buf->th_doacross_info[last++] = range_length;
4146     pr_buf->th_doacross_info[last++] = dims[j].lo;
4147     pr_buf->th_doacross_info[last++] = dims[j].up;
4148     pr_buf->th_doacross_info[last++] = dims[j].st;
4149   }
4150 
4151   // Compute total trip count.
4152   // Start with range of dims[0] which we don't need to keep in the buffer.
4153   if (dims[0].st == 1) { // most common case
4154     trace_count = dims[0].up - dims[0].lo + 1;
4155   } else if (dims[0].st > 0) {
4156     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4157     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4158   } else { // negative increment
4159     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4160     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4161   }
4162   for (j = 1; j < num_dims; ++j) {
4163     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4164   }
4165   KMP_DEBUG_ASSERT(trace_count > 0);
4166 
4167   // Check that the shared buffer is not still occupied by a previous loop
4168   // (the one that used buffer index idx - __kmp_dispatch_num_buffers)
4169   if (idx != sh_buf->doacross_buf_idx) {
4170     // Shared buffer is occupied, wait for it to be free
4171     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4172                  __kmp_eq_4, NULL);
4173   }
4174 #if KMP_32_BIT_ARCH
4175   // Check if we are the first thread. After the CAS the first thread gets 0;
4176   // others get 1 if initialization is in progress, or the allocated pointer.
4177   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4178   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4179       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4180 #else
4181   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4182       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4183 #endif
4184   if (flags == NULL) {
4185     // we are the first thread, allocate the array of flags
4186     size_t size =
4187         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4188     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4189     KMP_MB();
4190     sh_buf->doacross_flags = flags;
4191   } else if (flags == (kmp_uint32 *)1) {
4192 #if KMP_32_BIT_ARCH
4193     // initialization is still in progress, need to wait
4194     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4195 #else
4196     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4197 #endif
4198       KMP_YIELD(TRUE);
4199     KMP_MB();
4200   } else {
4201     KMP_MB();
4202   }
4203   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4204   pr_buf->th_doacross_flags =
4205       sh_buf->doacross_flags; // save private copy in order to not
4206   // touch shared buffer on each iteration
4207   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4208 }
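
/*
  Illustrative sketch (an assumption about typical code generation for a
  one-dimensional "#pragma omp for ordered(1)" loop with depend(sink)/
  depend(source) clauses; real code generation may normalize the bounds):

    struct kmp_dim dim;
    dim.lo = lo; dim.up = up; dim.st = st; // inclusive bounds, as noted above
    __kmpc_doacross_init(loc, gtid, 1, &dim);
    for (kmp_int64 i = lo; i <= up; i += st) {
      kmp_int64 sink[1] = {i - st};
      __kmpc_doacross_wait(loc, gtid, sink); // ordered depend(sink : i - st)
      // ... loop body ...
      kmp_int64 src[1] = {i};
      __kmpc_doacross_post(loc, gtid, src); // ordered depend(source)
    }
    __kmpc_doacross_fini(loc, gtid);
*/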
4209 
4210 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4211   __kmp_assert_valid_gtid(gtid);
4212   kmp_int64 shft;
4213   size_t num_dims, i;
4214   kmp_uint32 flag;
4215   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4216   kmp_info_t *th = __kmp_threads[gtid];
4217   kmp_team_t *team = th->th.th_team;
4218   kmp_disp_t *pr_buf;
4219   kmp_int64 lo, up, st;
4220 
4221   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4222   if (team->t.t_serialized) {
4223     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4224     return; // no dependencies if team is serialized
4225   }
4226 
4227   // calculate sequential iteration number and check out-of-bounds condition
4228   pr_buf = th->th.th_dispatch;
4229   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4230   num_dims = (size_t)pr_buf->th_doacross_info[0];
4231   lo = pr_buf->th_doacross_info[2];
4232   up = pr_buf->th_doacross_info[3];
4233   st = pr_buf->th_doacross_info[4];
4234 #if OMPT_SUPPORT && OMPT_OPTIONAL
4235   ompt_dependence_t deps[num_dims];
4236 #endif
4237   if (st == 1) { // most common case
4238     if (vec[0] < lo || vec[0] > up) {
4239       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4240                     "bounds [%lld,%lld]\n",
4241                     gtid, vec[0], lo, up));
4242       return;
4243     }
4244     iter_number = vec[0] - lo;
4245   } else if (st > 0) {
4246     if (vec[0] < lo || vec[0] > up) {
4247       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4248                     "bounds [%lld,%lld]\n",
4249                     gtid, vec[0], lo, up));
4250       return;
4251     }
4252     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4253   } else { // negative increment
4254     if (vec[0] > lo || vec[0] < up) {
4255       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4256                     "bounds [%lld,%lld]\n",
4257                     gtid, vec[0], lo, up));
4258       return;
4259     }
4260     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4261   }
4262 #if OMPT_SUPPORT && OMPT_OPTIONAL
4263   deps[0].variable.value = iter_number;
4264   deps[0].dependence_type = ompt_dependence_type_sink;
4265 #endif
4266   for (i = 1; i < num_dims; ++i) {
4267     kmp_int64 iter, ln;
4268     size_t j = i * 4;
4269     ln = pr_buf->th_doacross_info[j + 1];
4270     lo = pr_buf->th_doacross_info[j + 2];
4271     up = pr_buf->th_doacross_info[j + 3];
4272     st = pr_buf->th_doacross_info[j + 4];
4273     if (st == 1) {
4274       if (vec[i] < lo || vec[i] > up) {
4275         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4276                       "bounds [%lld,%lld]\n",
4277                       gtid, vec[i], lo, up));
4278         return;
4279       }
4280       iter = vec[i] - lo;
4281     } else if (st > 0) {
4282       if (vec[i] < lo || vec[i] > up) {
4283         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4284                       "bounds [%lld,%lld]\n",
4285                       gtid, vec[i], lo, up));
4286         return;
4287       }
4288       iter = (kmp_uint64)(vec[i] - lo) / st;
4289     } else { // st < 0
4290       if (vec[i] > lo || vec[i] < up) {
4291         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4292                       "bounds [%lld,%lld]\n",
4293                       gtid, vec[i], lo, up));
4294         return;
4295       }
4296       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4297     }
4298     iter_number = iter + ln * iter_number;
4299 #if OMPT_SUPPORT && OMPT_OPTIONAL
4300     deps[i].variable.value = iter;
4301     deps[i].dependence_type = ompt_dependence_type_sink;
4302 #endif
4303   }
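  // The flags array allocated in __kmpc_doacross_init() keeps one bit per
  // linearized iteration, packed into 32-bit words: bit (iter_number % 32) of
  // word (iter_number / 32) is set once that iteration has been posted.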
4304   shft = iter_number % 32; // use 32-bit granularity
4305   iter_number >>= 5; // divided by 32
4306   flag = 1 << shft;
4307   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4308     KMP_YIELD(TRUE);
4309   }
4310   KMP_MB();
4311 #if OMPT_SUPPORT && OMPT_OPTIONAL
4312   if (ompt_enabled.ompt_callback_dependences) {
4313     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4314         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4315   }
4316 #endif
4317   KA_TRACE(20,
4318            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4319             gtid, (iter_number << 5) + shft));
4320 }
4321 
4322 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4323   __kmp_assert_valid_gtid(gtid);
4324   kmp_int64 shft;
4325   size_t num_dims, i;
4326   kmp_uint32 flag;
4327   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4328   kmp_info_t *th = __kmp_threads[gtid];
4329   kmp_team_t *team = th->th.th_team;
4330   kmp_disp_t *pr_buf;
4331   kmp_int64 lo, st;
4332 
4333   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4334   if (team->t.t_serialized) {
4335     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4336     return; // no dependencies if team is serialized
4337   }
4338 
4339   // calculate sequential iteration number (same as in "wait" but no
4340   // out-of-bounds checks)
4341   pr_buf = th->th.th_dispatch;
4342   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4343   num_dims = (size_t)pr_buf->th_doacross_info[0];
4344   lo = pr_buf->th_doacross_info[2];
4345   st = pr_buf->th_doacross_info[4];
4346 #if OMPT_SUPPORT && OMPT_OPTIONAL
4347   ompt_dependence_t deps[num_dims];
4348 #endif
4349   if (st == 1) { // most common case
4350     iter_number = vec[0] - lo;
4351   } else if (st > 0) {
4352     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4353   } else { // negative increment
4354     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4355   }
4356 #if OMPT_SUPPORT && OMPT_OPTIONAL
4357   deps[0].variable.value = iter_number;
4358   deps[0].dependence_type = ompt_dependence_type_source;
4359 #endif
4360   for (i = 1; i < num_dims; ++i) {
4361     kmp_int64 iter, ln;
4362     size_t j = i * 4;
4363     ln = pr_buf->th_doacross_info[j + 1];
4364     lo = pr_buf->th_doacross_info[j + 2];
4365     st = pr_buf->th_doacross_info[j + 4];
4366     if (st == 1) {
4367       iter = vec[i] - lo;
4368     } else if (st > 0) {
4369       iter = (kmp_uint64)(vec[i] - lo) / st;
4370     } else { // st < 0
4371       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4372     }
4373     iter_number = iter + ln * iter_number;
4374 #if OMPT_SUPPORT && OMPT_OPTIONAL
4375     deps[i].variable.value = iter;
4376     deps[i].dependence_type = ompt_dependence_type_source;
4377 #endif
4378   }
4379 #if OMPT_SUPPORT && OMPT_OPTIONAL
4380   if (ompt_enabled.ompt_callback_dependences) {
4381     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4382         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4383   }
4384 #endif
4385   shft = iter_number % 32; // use 32-bit granularity
4386   iter_number >>= 5; // divided by 32
4387   flag = 1 << shft;
4388   KMP_MB();
4389   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4390     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4391   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4392                 (iter_number << 5) + shft));
4393 }
4394 
4395 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4396   __kmp_assert_valid_gtid(gtid);
4397   kmp_int32 num_done;
4398   kmp_info_t *th = __kmp_threads[gtid];
4399   kmp_team_t *team = th->th.th_team;
4400   kmp_disp_t *pr_buf = th->th.th_dispatch;
4401 
4402   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4403   if (team->t.t_serialized) {
4404     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4405     return; // nothing to do
4406   }
4407   num_done =
4408       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4409   if (num_done == th->th.th_team_nproc) {
4410     // we are the last thread, need to free shared resources
4411     int idx = pr_buf->th_doacross_buf_idx - 1;
4412     dispatch_shared_info_t *sh_buf =
4413         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4414     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4415                      (kmp_int64)&sh_buf->doacross_num_done);
4416     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4417     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4418     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4419     sh_buf->doacross_flags = NULL;
4420     sh_buf->doacross_num_done = 0;
4421     sh_buf->doacross_buf_idx +=
4422         __kmp_dispatch_num_buffers; // free buffer for future re-use
4423   }
4424   // free private resources (need to keep buffer index forever)
4425   pr_buf->th_doacross_flags = NULL;
4426   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4427   pr_buf->th_doacross_info = NULL;
4428   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4429 }
4430 
4431 /* OpenMP 5.1 Memory Management routines */
4432 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4433   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4434 }
4435 
4436 void *omp_aligned_alloc(size_t align, size_t size,
4437                         omp_allocator_handle_t allocator) {
4438   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4439 }
4440 
4441 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4442   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4443 }
4444 
4445 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4446                          omp_allocator_handle_t allocator) {
4447   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4448 }
4449 
4450 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4451                   omp_allocator_handle_t free_allocator) {
4452   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4453                        free_allocator);
4454 }
4455 
4456 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4457   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4458 }
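
/* Illustrative use of the allocator API above (the calls are standard
   OpenMP 5.1; the pattern itself is only an example):

     double *v = (double *)omp_aligned_alloc(64, n * sizeof(double),
                                             omp_default_mem_alloc);
     if (v) {
       // ... use v ...
       omp_free(v, omp_default_mem_alloc);
     }

   Passing omp_null_allocator instead selects the allocator named by the
   def-allocator-var ICV (settable via OMP_ALLOCATOR). */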
4459 /* end of OpenMP 5.1 Memory Management routines */
4460 
4461 int __kmpc_get_target_offload(void) {
4462   if (!__kmp_init_serial) {
4463     __kmp_serial_initialize();
4464   }
4465   return __kmp_target_offload;
4466 }
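
/* Note: the value returned above reflects the OMP_TARGET_OFFLOAD ICV
   (DISABLED / DEFAULT / MANDATORY).  The offload runtime (libomptarget) is
   the expected caller; with OMP_TARGET_OFFLOAD=MANDATORY it uses this value
   to turn failed device initialization into a hard error instead of a silent
   host fallback. */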
4467 
4468 int __kmpc_pause_resource(kmp_pause_status_t level) {
4469   if (!__kmp_init_serial) {
4470     return 1; // Can't pause if runtime is not initialized
4471   }
4472   return __kmp_pause_resource(level);
4473 }
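
/* Illustrative user-level counterpart (standard OpenMP 5.0 API; sketch only):

     if (omp_pause_resource_all(omp_pause_soft) != 0)
       fprintf(stderr, "runtime resources could not be released\n");

   A soft pause lets the runtime release worker threads and other reusable
   resources; a later parallel region re-creates them on demand.  As in the
   wrapper above, a nonzero return means the request was not honored. */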
4474 
4475 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4476   if (!__kmp_init_serial)
4477     __kmp_serial_initialize();
4478 
4479   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4480 
4481 #if OMPT_SUPPORT
4482   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4483     ompt_callbacks.ompt_callback(ompt_callback_error)(
4484         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4485         OMPT_GET_RETURN_ADDRESS(0));
4486   }
4487 #endif // OMPT_SUPPORT
4488 
4489   char *src_loc;
4490   if (loc && loc->psource) {
4491     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4492     src_loc =
4493         __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4494     __kmp_str_loc_free(&str_loc);
4495   } else {
4496     src_loc = __kmp_str_format("unknown");
4497   }
4498 
4499   if (severity == severity_warning)
4500     KMP_WARNING(UserDirectedWarning, src_loc, message);
4501   else
4502     KMP_FATAL(UserDirectedError, src_loc, message);
4503 
4504   __kmp_str_free(&src_loc);
4505 }
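
/* Illustrative source-level trigger (OpenMP 5.1 error directive; a sketch of
   the expected lowering, not a guarantee of the exact call):

     #pragma omp error at(execution) severity(warning) message("bad input")

   reaches this entry point as roughly
   __kmpc_error(loc, severity_warning, "bad input"), which prints the
   UserDirectedWarning text together with the source location, while
   severity(fatal) goes through KMP_FATAL and terminates the program. */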
4506 
4507 // Mark the beginning of a scope directive.
4508 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4509 // reserved is for future extension of the scope directive and is not used.
4510 #if OMPT_SUPPORT && OMPT_OPTIONAL
4511   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4512     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4513     int tid = __kmp_tid_from_gtid(gtid);
4514     ompt_callbacks.ompt_callback(ompt_callback_work)(
4515         ompt_work_scope, ompt_scope_begin,
4516         &(team->t.ompt_team_info.parallel_data),
4517         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4518         OMPT_GET_RETURN_ADDRESS(0));
4519   }
4520 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4521 }
4522 
4523 // Mark the end of a scope directive.
4524 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4525 // reserved is for future extension of the scope directive and is not used.
4526 #if OMPT_SUPPORT && OMPT_OPTIONAL
4527   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4528     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4529     int tid = __kmp_tid_from_gtid(gtid);
4530     ompt_callbacks.ompt_callback(ompt_callback_work)(
4531         ompt_work_scope, ompt_scope_end,
4532         &(team->t.ompt_team_info.parallel_data),
4533         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4534         OMPT_GET_RETURN_ADDRESS(0));
4535   }
4536 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4537 }
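
/* Illustrative source-level counterpart (OpenMP 5.1 scope construct; whether
   a compiler emits these calls is its own choice):

     #pragma omp parallel
     {
       #pragma omp scope reduction(+ : sum)
       { sum += compute_part(); } // compute_part() is a placeholder
     }

   The two entry points above exist only to bracket the structured block so
   that OMPT tools observe ompt_work_scope begin/end events; the runtime does
   no other work for a scope region. */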
4538 
4539 #ifdef KMP_USE_VERSION_SYMBOLS
4540 // For GOMP compatibility there are two versions of each omp_* API: the plain
4541 // C symbol and the Fortran symbol with an appended underscore. When we
4542 // implement a specific ompc_* version of an omp_* function, we want the
4543 // versioned GOMP symbol for the plain name to alias that ompc_* version
4544 // rather than the Fortran version from kmp_ftn_entry.h.
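// Sketch of the effect (the exact expansion is in the KMP_VERSION_OMPC_SYMBOL
// definition): on ELF targets each invocation below amounts to an alias plus
// a .symver directive, roughly
//   __asm__(".symver ompc_display_affinity, omp_display_affinity@@OMP_5.0");
// which makes the default-versioned omp_* symbol resolve to the ompc_*
// implementation.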
4545 extern "C" {
4546 // Have to undef these from omp.h so they aren't translated into
4547 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4548 #ifdef omp_set_affinity_format
4549 #undef omp_set_affinity_format
4550 #endif
4551 #ifdef omp_get_affinity_format
4552 #undef omp_get_affinity_format
4553 #endif
4554 #ifdef omp_display_affinity
4555 #undef omp_display_affinity
4556 #endif
4557 #ifdef omp_capture_affinity
4558 #undef omp_capture_affinity
4559 #endif
4560 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4561                         "OMP_5.0");
4562 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4563                         "OMP_5.0");
4564 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4565                         "OMP_5.0");
4566 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4567                         "OMP_5.0");
4568 } // extern "C"
4569 #endif
4570