xref: /llvm-project/openmp/runtime/src/z_Linux_asm.S (revision 90a05f32166c4a45224a5eedbec9c5c7e21d2dbf)
1//  z_Linux_asm.S:  - microtasking routines specifically
2//                    written for Intel platforms running Linux* OS
3
4//
5////===----------------------------------------------------------------------===//
6////
7//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8//// See https://llvm.org/LICENSE.txt for license information.
9//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10////
11////===----------------------------------------------------------------------===//
12//
13
14// -----------------------------------------------------------------------
15// macros
16// -----------------------------------------------------------------------
17
18#include "kmp_config.h"
19
20#if KMP_ARCH_X86 || KMP_ARCH_X86_64
21
22# if defined(__ELF__) && defined(__CET__) && defined(__has_include)
23# if __has_include(<cet.h>)
24# include <cet.h>
25# endif
26# endif
27
28# if !defined(_CET_ENDBR)
29# define _CET_ENDBR
30# endif
31
32# if KMP_MIC
33// the 'delay r16/r32/r64' should be used instead of the 'pause'.
34// The delay operation has the effect of removing the current thread from
35// the round-robin HT mechanism, and therefore speeds up the issue rate of
36// the other threads on the same core.
37//
38// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
39// barrier time to increase greatly for 3 or more threads per core.
40//
41// A value of 100 works pretty well for up to 4 threads per core, but isn't
42// quite as fast as 0 for 2 threads per core.
43//
44// We need to check what happens for oversubscription / > 4 threads per core.
45// It is possible that we need to pass the delay value in as a parameter
46// that the caller determines based on the total # threads / # cores.
47//
48//.macro pause_op
49//	mov    $100, %rax
50//	delay  %rax
51//.endm
52# else
53#  define pause_op   .byte 0xf3,0x90  // raw bytes F3 90, i.e. the encoding of the x86 pause instruction
54# endif // KMP_MIC
55
56# if KMP_OS_DARWIN
57#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
58#  define KMP_LABEL(x) L_##x             // form the name of label
59.macro KMP_CFI_DEF_OFFSET               // the four KMP_CFI_* helpers expand to nothing on OS X*
60.endmacro
61.macro KMP_CFI_OFFSET                   // (empty, see KMP_CFI_DEF_OFFSET)
62.endmacro
63.macro KMP_CFI_REGISTER                 // (empty)
64.endmacro
65.macro KMP_CFI_DEF                      // (empty)
66.endmacro
67.macro ALIGN                            // ALIGN n: hands n to .align unchanged
68	.align $0
69.endmacro
70.macro DEBUG_INFO                       // end-of-function marker; emits nothing in this variant
71/* Not sure what .size does in icc, not sure if we need to do something
72   similar for OS X*.
73*/
74.endmacro
75.macro PROC                             // PROC name: open the exported function _name
76	ALIGN  4
77	.globl KMP_PREFIX_UNDERSCORE($0)
78KMP_PREFIX_UNDERSCORE($0):
79	_CET_ENDBR                         // defined empty above unless cet.h provided it
80.endmacro
81# else // KMP_OS_DARWIN
82#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
83// Format labels so that they don't override function names in gdb's backtraces
84// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
85// on OS X*)
86# if KMP_MIC
87#  define KMP_LABEL(x) L_##x          // local label
88# else
89#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
90# endif // KMP_MIC
91.macro ALIGN size                     // ALIGN n: the argument is an exponent, alignment is 2^n
92	.align 1<<(\size)
93.endm
94.macro DEBUG_INFO proc                // DEBUG_INFO name: close the function opened by PROC name
95	.cfi_endproc
96// Not sure why we need .type and .size for the functions
97	.align 16
98	.type  \proc,@function
99        .size  \proc,.-\proc
100.endm
101.macro PROC proc                      // PROC name: align, export and open function name
102	ALIGN  4
103        .globl KMP_PREFIX_UNDERSCORE(\proc)
104KMP_PREFIX_UNDERSCORE(\proc):
105	.cfi_startproc
106	_CET_ENDBR                      // defined empty above unless cet.h provided it
107.endm
108.macro KMP_CFI_DEF_OFFSET sz          // thin wrapper over .cfi_def_cfa_offset
109	.cfi_def_cfa_offset	\sz
110.endm
111.macro KMP_CFI_OFFSET reg, sz         // thin wrapper over .cfi_offset
112	.cfi_offset	\reg,\sz
113.endm
114.macro KMP_CFI_REGISTER reg           // thin wrapper over .cfi_def_cfa_register
115	.cfi_def_cfa_register	\reg
116.endm
117.macro KMP_CFI_DEF reg, sz            // thin wrapper over .cfi_def_cfa
118	.cfi_def_cfa	\reg,\sz
119.endm
120# endif // KMP_OS_DARWIN
121#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
122
123#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
124
125# if KMP_OS_DARWIN
126#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
127#  define KMP_LABEL(x) L_##x             // form the name of label
128
129.macro ALIGN                            // ALIGN n: hands n to .align unchanged
130	.align $0
131.endmacro
132
133.macro DEBUG_INFO                       // end-of-function marker; emits nothing in this variant
134/* Not sure what .size does in icc, not sure if we need to do something
135   similar for OS X*.
136*/
137.endmacro
138
139.macro PROC                             // PROC name: align, export and open function _name
140	ALIGN  4
141	.globl KMP_PREFIX_UNDERSCORE($0)
142KMP_PREFIX_UNDERSCORE($0):
143.endmacro
144# elif KMP_OS_WINDOWS
145#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Windows/ARM64 symbols
146// Format labels so that they don't override function names in gdb's backtraces
147#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
148
149.macro ALIGN size                     // ALIGN n: the argument is an exponent, alignment is 2^n
150	.align 1<<(\size)
151.endm
152
153.macro DEBUG_INFO proc                // only re-aligns; no .type/.size/CFI is emitted in this variant
154	ALIGN 2
155.endm
156
157.macro PROC proc                      // PROC name: align, export and open function name (no CFI)
158	ALIGN 2
159	.globl KMP_PREFIX_UNDERSCORE(\proc)
160KMP_PREFIX_UNDERSCORE(\proc):
161.endm
162# else // KMP_OS_DARWIN || KMP_OS_WINDOWS
163#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
164// Format labels so that they don't override function names in gdb's backtraces
165#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
166
167.macro ALIGN size                     // ALIGN n: the argument is an exponent, alignment is 2^n
168	.align 1<<(\size)
169.endm
170
171.macro DEBUG_INFO proc                // DEBUG_INFO name: close the function opened by PROC name
172	.cfi_endproc
173// Not sure why we need .type and .size for the functions
174	ALIGN 2
175#if KMP_ARCH_ARM
176	.type  \proc,%function          // 32-bit ARM variant spells the symbol type with a percent sign
177#else
178	.type  \proc,@function
179#endif
180	.size  \proc,.-\proc
181.endm
182
183.macro PROC proc                      // PROC name: align, export and open function name with CFI
184	ALIGN 2
185	.globl KMP_PREFIX_UNDERSCORE(\proc)
186KMP_PREFIX_UNDERSCORE(\proc):
187	.cfi_startproc
188.endm
189# endif // KMP_OS_DARWIN
190
191# if KMP_OS_LINUX
192// BTI and PAC gnu property note: constants plus a macro that emits one note record
193#  define NT_GNU_PROPERTY_TYPE_0 5                        // note type field
194#  define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000   // property type passed as "type"
195#  define GNU_PROPERTY_AARCH64_FEATURE_1_BTI 1            // feature bit 0
196#  define GNU_PROPERTY_AARCH64_FEATURE_1_PAC 2            // feature bit 1
197// Emitted fields in order: name size 4, desc size 16, note type, "GNU", type, data size 4, value, pad 0
198#  define GNU_PROPERTY(type, value)                                            \
199  .pushsection .note.gnu.property, "a";                                        \
200  .p2align 3;                                                                  \
201  .word 4;                                                                     \
202  .word 16;                                                                    \
203  .word NT_GNU_PROPERTY_TYPE_0;                                                \
204  .asciz "GNU";                                                                \
205  .word type;                                                                  \
206  .word 4;                                                                     \
207  .word value;                                                                 \
208  .word 0;                                                                     \
209  .popsection
210# endif
211
212# if defined(__ARM_FEATURE_BTI_DEFAULT)
213#  define BTI_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_BTI
214# else
215#  define BTI_FLAG 0
216# endif
217# if __ARM_FEATURE_PAC_DEFAULT & 3
218#  define PAC_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_PAC
219# else
220#  define PAC_FLAG 0
221# endif
222// PACBTI_C / PACBTI_RET are spelled as hint numbers; they expand to nothing when neither flag is set
223# if (BTI_FLAG | PAC_FLAG) != 0
224#  if PAC_FLAG != 0
225#   define PACBTI_C hint #25    // hint 25 is the PACIASP encoding
226#   define PACBTI_RET hint #29  // hint 29 is the AUTIASP encoding
227#  else
228#   define PACBTI_C hint #34    // hint 34 is the BTI c encoding
229#   define PACBTI_RET
230#  endif
231#  define GNU_PROPERTY_BTI_PAC \
232    GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, BTI_FLAG | PAC_FLAG)
233# else
234#  define PACBTI_C
235#  define PACBTI_RET
236#  define GNU_PROPERTY_BTI_PAC
237# endif
238#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
239
240.macro COMMON name, size, align_power   // declare a common symbol; align_power is an exponent
241#if KMP_OS_DARWIN
242	.comm \name, \size                 // alignment argument is dropped in this variant
243#elif KMP_OS_WINDOWS
244	.comm \name, \size, \align_power   // exponent is passed through unchanged
245#else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
246	.comm \name, \size, (1<<(\align_power))   // exponent converted to a byte count
247#endif
248.endm
249
250// -----------------------------------------------------------------------
251// data
252// -----------------------------------------------------------------------
253
254#ifdef KMP_GOMP_COMPAT
255
256// Support for unnamed common blocks.
257//
258// Because the symbol ".gomp_critical_user_" contains a ".", we have to
259// put this stuff in assembly.
260
261# if KMP_ARCH_X86
262#  if KMP_OS_DARWIN
263        .data
264        .comm .gomp_critical_user_,32          // 32-byte common block
265        .data
266        .globl ___kmp_unnamed_critical_addr
267___kmp_unnamed_critical_addr:                  // 4-byte variable holding the address of the block
268        .long .gomp_critical_user_
269#  else /* Linux* OS */
270        .data
271        .comm .gomp_critical_user_,32,8        // 32-byte common block, third operand 8
272        .data
273	ALIGN 4                                 // NOTE(review): ALIGN takes an exponent, so this is 16 bytes for a 4-byte object
274        .global __kmp_unnamed_critical_addr
275__kmp_unnamed_critical_addr:                   // 4-byte variable holding the address of the block
276        .4byte .gomp_critical_user_
277        .type __kmp_unnamed_critical_addr,@object
278        .size __kmp_unnamed_critical_addr,4
279#  endif /* KMP_OS_DARWIN */
280# endif /* KMP_ARCH_X86 */
281
282# if KMP_ARCH_X86_64
283#  if KMP_OS_DARWIN
284        .data
285        .comm .gomp_critical_user_,32          // 32-byte common block
286        .data
287        .globl ___kmp_unnamed_critical_addr
288___kmp_unnamed_critical_addr:                  // 8-byte variable holding the address of the block
289        .quad .gomp_critical_user_
290#  else /* Linux* OS */
291        .data
292        .comm .gomp_critical_user_,32,8        // 32-byte common block, third operand 8
293        .data
294	ALIGN 8                                 // NOTE(review): ALIGN takes an exponent, so this is 256 bytes; ALIGN 3 was probably meant
295        .global __kmp_unnamed_critical_addr
296__kmp_unnamed_critical_addr:                   // 8-byte variable holding the address of the block
297        .8byte .gomp_critical_user_
298        .type __kmp_unnamed_critical_addr,@object
299        .size __kmp_unnamed_critical_addr,8
300#  endif /* KMP_OS_DARWIN */
301# endif /* KMP_ARCH_X86_64 */
302
303#endif /* KMP_GOMP_COMPAT */
304
305
306#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
307
308// -----------------------------------------------------------------------
309// microtasking routines specifically written for IA-32 architecture
310// running Linux* OS
311// -----------------------------------------------------------------------
312
313	.ident "Intel Corporation"
314	.data
315	ALIGN 4
316// void
317// __kmp_x86_pause( void );
318// Executes one pause_op and returns; takes no arguments and sets no return register.
319        .text
320	PROC  __kmp_x86_pause
321
322        pause_op                // macro defined near the top of the file
323        ret
324
325	DEBUG_INFO __kmp_x86_pause
326
327# if !KMP_ASM_INTRINS
328
329//------------------------------------------------------------------------
330// kmp_int32 __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
331// Atomic fetch-and-add: adds d to *p and returns in %eax the value held before.
332
333	PROC  __kmp_test_then_add32
334
335	movl	8(%esp), %eax		// %eax = addend "d"
336	movl	4(%esp), %ecx		// %ecx = target address "p"
337	lock
338	xaddl	%eax, (%ecx)		// (%ecx) += %eax, %eax = previous (%ecx)
339	ret
340
341	DEBUG_INFO __kmp_test_then_add32
342
343//------------------------------------------------------------------------
344// FUNCTION __kmp_xchg_fixed8
345//
346// kmp_int8
347// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
348//
349// Atomically stores "d" into *p and hands back the byte it replaced.
350// 	in:	p at 4(%esp), d at 8(%esp)
351// 	out:	%al = previous *p
352//
353// scratch:	%ecx
354	PROC  __kmp_xchg_fixed8
355
356	movb	8(%esp), %al		// new byte "d"
357	movl	4(%esp), %ecx		// destination "p"
358
359	lock
360	xchgb	%al, (%ecx)		// swap %al with (%ecx)
361	ret
362
363	DEBUG_INFO __kmp_xchg_fixed8
364
365
366//------------------------------------------------------------------------
367// FUNCTION __kmp_xchg_fixed16
368//
369// kmp_int16
370// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
371//
372// Atomically stores "d" into *p and hands back the 16-bit value it replaced.
373// 	in:	p at 4(%esp), d at 8(%esp)
374// 	out:	%ax = previous *p
375// scratch:	%ecx
376	PROC  __kmp_xchg_fixed16
377
378	movw	8(%esp), %ax		// new value "d"
379	movl	4(%esp), %ecx		// destination "p"
380
381	lock
382	xchgw	%ax, (%ecx)		// swap %ax with (%ecx)
383	ret
384
385	DEBUG_INFO __kmp_xchg_fixed16
386
387
388//------------------------------------------------------------------------
389// FUNCTION __kmp_xchg_fixed32
390//
391// kmp_int32
392// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
393//
394// Atomically stores "d" into *p and hands back the 32-bit value it replaced.
395// 	in:	p at 4(%esp), d at 8(%esp)
396// 	out:	%eax = previous *p
397//
398// scratch:	%ecx
399	PROC  __kmp_xchg_fixed32
400
401	movl	8(%esp), %eax		// new value "d"
402	movl	4(%esp), %ecx		// destination "p"
403
404	lock
405	xchgl	%eax, (%ecx)		// swap %eax with (%ecx)
406	ret
407
408	DEBUG_INFO __kmp_xchg_fixed32
409
410
411// kmp_int8 __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
412// Stores sv into *p only if *p equals cv; %eax is 1 when the store happened, else 0.
413	PROC  __kmp_compare_and_store8
414
415	movb	12(%esp), %dl		// replacement "sv"
416	movb	8(%esp), %al		// expected "cv"
417	movl	4(%esp), %ecx		// target "p"
418	lock
419	cmpxchgb	%dl, (%ecx)	// ZF = 1 and (%ecx) = %dl when (%ecx) == %al
420	sete	%al			// turn ZF into a 0/1 byte
421	movzbl	%al, %eax		// zero-extend the flag into the whole register
422	ret
423
424	DEBUG_INFO __kmp_compare_and_store8
425
426// kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
427// Stores sv into *p only if *p equals cv; %eax is 1 when the store happened, else 0.
428	PROC  __kmp_compare_and_store16
429
430	movw	12(%esp), %dx		// replacement "sv"
431	movw	8(%esp), %ax		// expected "cv"
432	movl	4(%esp), %ecx		// target "p"
433	lock
434	cmpxchgw	%dx, (%ecx)	// ZF = 1 and (%ecx) = %dx when (%ecx) == %ax
435	sete	%al			// turn ZF into a 0/1 byte
436	movzbl	%al, %eax		// zero-extend the flag into the whole register
437	ret
438
439	DEBUG_INFO __kmp_compare_and_store16
440
441// kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
442// Stores sv into *p only if *p equals cv; %eax is 1 when the store happened, else 0.
443	PROC  __kmp_compare_and_store32
444
445	movl	12(%esp), %edx		// replacement "sv"
446	movl	8(%esp), %eax		// expected "cv"
447	movl	4(%esp), %ecx		// target "p"
448	lock
449	cmpxchgl	%edx, (%ecx)	// ZF = 1 and (%ecx) = %edx when (%ecx) == %eax
450	sete	%al			// turn ZF into a 0/1 byte
451	movzbl	%al, %eax		// zero-extend the flag into the whole register
452	ret
453
454	DEBUG_INFO __kmp_compare_and_store32
455
456// kmp_int32
457// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
458        PROC  __kmp_compare_and_store64
459// Returns 1 in %eax when *p matched cv and was replaced by sv, otherwise 0.
460        pushl     %ebp
461        movl      %esp, %ebp
462        pushl     %ebx                  // overwritten below, restored before return
463        pushl     %edi                  // overwritten below, restored before return
464        movl      8(%ebp), %edi         // "p" (first argument once %ebp is the frame base)
465        movl      12(%ebp), %eax        // "cv" low order word
466        movl      16(%ebp), %edx        // "cv" high order word
467        movl      20(%ebp), %ebx        // "sv" low order word
468        movl      24(%ebp), %ecx        // "sv" high order word
469        lock
470        cmpxchg8b (%edi)                // expected value in %edx:%eax, replacement in %ecx:%ebx
471        sete      %al      // if %edx:eax == (%edi) set %al = 1 else set %al = 0
472        and       $1, %eax // clear the upper bits so %eax is exactly 0 or 1
473        popl      %edi
474        popl      %ebx
475        movl      %ebp, %esp
476        popl      %ebp
477        ret
478
479        DEBUG_INFO __kmp_compare_and_store64
480
481// kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
482// Compare-and-swap that returns in %al the value cmpxchgb leaves there (the old *p).
483	PROC  __kmp_compare_and_store_ret8
484
485	movb	12(%esp), %dl		// replacement "sv"
486	movb	8(%esp), %al		// expected "cv"
487	movl	4(%esp), %ecx		// target "p"
488	lock
489	cmpxchgb	%dl, (%ecx)	// on mismatch %al is reloaded from (%ecx)
490	ret
491
492	DEBUG_INFO __kmp_compare_and_store_ret8
493
494// kmp_int16
495// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
496// Compare-and-swap that returns in %ax the value cmpxchgw leaves there (the old *p).
497	PROC  __kmp_compare_and_store_ret16
498
499	movw	12(%esp), %dx		// replacement "sv"
500	movw	8(%esp), %ax		// expected "cv"
501	movl	4(%esp), %ecx		// target "p"
502	lock
503	cmpxchgw	%dx, (%ecx)	// on mismatch %ax is reloaded from (%ecx)
504	ret
505
506	DEBUG_INFO __kmp_compare_and_store_ret16
507
508// kmp_int32
509// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
510// Compare-and-swap that returns in %eax the value cmpxchgl leaves there (the old *p).
511	PROC  __kmp_compare_and_store_ret32
512
513	movl	12(%esp), %edx		// replacement "sv"
514	movl	8(%esp), %eax		// expected "cv"
515	movl	4(%esp), %ecx		// target "p"
516	lock
517	cmpxchgl	%edx, (%ecx)	// on mismatch %eax is reloaded from (%ecx)
518	ret
519
520	DEBUG_INFO __kmp_compare_and_store_ret32
521
522// kmp_int64
523// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
524//                               kmp_int64 sv);
525        PROC  __kmp_compare_and_store_ret64
526// Returns in %edx:%eax whatever cmpxchg8b leaves there (no sete/and follows it).
527        pushl     %ebp
528        movl      %esp, %ebp
529        pushl     %ebx                  // overwritten below, restored before return
530        pushl     %edi                  // overwritten below, restored before return
531        movl      8(%ebp), %edi         // "p" (first argument once %ebp is the frame base)
532        movl      12(%ebp), %eax        // "cv" low order word
533        movl      16(%ebp), %edx        // "cv" high order word
534        movl      20(%ebp), %ebx        // "sv" low order word
535        movl      24(%ebp), %ecx        // "sv" high order word
536        lock
537        cmpxchg8b (%edi)                // expected value in %edx:%eax, replacement in %ecx:%ebx
538        popl      %edi
539        popl      %ebx
540        movl      %ebp, %esp
541        popl      %ebp
542        ret
543
544        DEBUG_INFO __kmp_compare_and_store_ret64
545
546
547//------------------------------------------------------------------------
548// FUNCTION __kmp_xchg_real32
549//
550// kmp_real32
551// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
552//
553// parameters (offsets at entry, before the frame is set up):
554// 	addr:	4(%esp)
555// 	data:	8(%esp)
556//
557// return:	%st(0), loaded from the bits the atomic exchange displaced
558        PROC  __kmp_xchg_real32
559
560        pushl   %ebp
561        movl    %esp, %ebp
562        subl    $4, %esp
563                        // scratch slot at -4(%ebp) for the integer -> x87 move
564
565        movl    8(%ebp), %ecx
566                        // "addr": 0(%ebp) is the saved %ebp and 4(%ebp) the
567                        // return address, so the first argument sits at 8(%ebp)
568        movl    12(%ebp), %eax
569                        // bit pattern of "data"
570
571        lock
572        xchgl   %eax, (%ecx)
573                        // %eax now holds exactly the value that was replaced
574
575        movl    %eax, -4(%ebp)
576        flds    -4(%ebp)
577                        // return old value; one push keeps the x87 stack balanced
578
579                        // %ecx is caller-saved, so no register needs restoring
580        movl    %ebp, %esp
581        popl    %ebp
582        ret
583
584        DEBUG_INFO __kmp_xchg_real32
585
586# endif /* !KMP_ASM_INTRINS */
587
588//------------------------------------------------------------------------
589// int
590// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
591//                         int gtid, int tid,
592//                         int argc, void *p_argv[]
593// #if OMPT_SUPPORT
594//                         ,
595//                         void **exit_frame_ptr
596// #endif
597//                       ) {
598// #if OMPT_SUPPORT
599//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
600// #endif
601//
602//   (*pkfn)( & gtid, & tid, argv[0], ... );
603//   return 1;
604// }
605// Frame offsets used below: pkfn 8, gtid 12, tid 16, argc 20, p_argv 24, exit_frame_ptr 28 (all off %ebp).
606// -- Begin __kmp_invoke_microtask
607// mark_begin;
608	PROC  __kmp_invoke_microtask
609
610	pushl %ebp
611	KMP_CFI_DEF_OFFSET 8
612	KMP_CFI_OFFSET ebp,-8
613	movl %esp,%ebp		// establish the base pointer for this routine.
614	KMP_CFI_REGISTER ebp
615	subl $8,%esp		// allocate space for two local variables.
616				// These variables are:
617				//	argv: -4(%ebp)
618				//	temp: -8(%ebp)
619				//
620	pushl %ebx		// save %ebx to use during this routine (lands at -12(%ebp))
621				//
622#if OMPT_SUPPORT
623	movl 28(%ebp),%ebx	// get exit_frame address
624	movl %ebp,(%ebx)	// save exit_frame
625#endif
626
627	movl 20(%ebp),%ebx	// Stack alignment - # args
628	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
629	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
630	movl %esp,%eax		//
631	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
632	movl %eax,%ebx		// Save to %ebx
633	andl $0xFFFFFF80,%eax	// mask off 7 bits
634	subl %eax,%ebx		// Amount to subtract from %esp (the 0..127 bytes of misalignment)
635	subl %ebx,%esp		// Prepare the stack ptr --
636				//   now it will be aligned on 128-byte boundary at the call
637
638	movl 24(%ebp),%eax	// copy from p_argv[]
639	movl %eax,-4(%ebp)	// into the local variable *argv.
640
641	movl 20(%ebp),%ebx	// argc is 20(%ebp)
642	shll $2,%ebx		// %ebx = argc*4, byte offset just past the last argv entry
643
644KMP_LABEL(invoke_2):		// loop head: push argv[] entries from last to first
645	cmpl $0,%ebx
646	jg  KMP_LABEL(invoke_4)	// entries remain
647	jmp KMP_LABEL(invoke_3)	// all entries pushed
648	ALIGN 2
649KMP_LABEL(invoke_4):
650	movl -4(%ebp),%eax
651	subl $4,%ebx			// step the byte offset back by one entry.
652	addl %ebx,%eax			// index into argv.
653	movl (%eax),%edx
654	pushl %edx
655
656	jmp KMP_LABEL(invoke_2)
657	ALIGN 2
658KMP_LABEL(invoke_3):
659	leal 16(%ebp),%eax		// push & tid
660	pushl %eax
661
662	leal 12(%ebp),%eax		// push & gtid
663	pushl %eax
664
665	movl 8(%ebp),%ebx
666	call *%ebx			// call (*pkfn)();
667
668	movl $1,%eax			// return 1;
669
670	movl -12(%ebp),%ebx		// restore %ebx
671	leave				// also discards the pushed arguments and alignment padding
672	KMP_CFI_DEF esp,4
673	ret
674
675	DEBUG_INFO __kmp_invoke_microtask
676// -- End  __kmp_invoke_microtask
677
678
679// kmp_uint64
680// __kmp_hardware_timestamp(void)
681	PROC  __kmp_hardware_timestamp
682	rdtsc				// 64-bit counter is left in %edx:%eax and returned as is
683	ret
684
685	DEBUG_INFO __kmp_hardware_timestamp
686// -- End  __kmp_hardware_timestamp
687
688#endif /* KMP_ARCH_X86 */
689
690
691#if KMP_ARCH_X86_64
692
693// -----------------------------------------------------------------------
694// microtasking routines specifically written for IA-32 architecture and
695// Intel(R) 64 running Linux* OS
696// -----------------------------------------------------------------------
697
698// -- Machine type P
699// mark_description "Intel Corporation";
700	.ident "Intel Corporation"
701// --	.file "z_Linux_asm.S"
702	.data
703	ALIGN 4
704
705// To keep code from landing in the .data section selected above, every routine
706// definition for x86_64 is preceded by its own .text directive.
707//------------------------------------------------------------------------
708# if !KMP_ASM_INTRINS
709
710//------------------------------------------------------------------------
711// FUNCTION __kmp_test_then_add32
712//
713// kmp_int32
714// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
715//
716// Atomic fetch-and-add on a 32-bit location.
717// 	in:	p in %rdi, d in %esi
718// 	out:	%eax = value of *p before the addition
719//
720// scratch:	%esi (receives the previous value before it is copied out)
721	.text
722	PROC  __kmp_test_then_add32
723
724	lock
725	xaddl	%esi, (%rdi)	// *p += d, %esi = previous *p
726	movl	%esi, %eax	// previous value becomes the return value
727	ret
728
729	DEBUG_INFO __kmp_test_then_add32
730
731
732//------------------------------------------------------------------------
733// FUNCTION __kmp_test_then_add64
734//
735// kmp_int64
736// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
737//
738// Atomic fetch-and-add on a 64-bit location.
739// 	in:	p in %rdi, d in %rsi
740// 	out:	%rax = value of *p before the addition
741// scratch:	%rsi (receives the previous value before it is copied out)
742	.text
743	PROC  __kmp_test_then_add64
744
745	lock
746	xaddq	%rsi, (%rdi)	// *p += d, %rsi = previous *p
747	movq	%rsi, %rax	// previous value becomes the return value
748	ret
749
750	DEBUG_INFO __kmp_test_then_add64
751
752
753//------------------------------------------------------------------------
754// FUNCTION __kmp_xchg_fixed8
755//
756// kmp_int8
757// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
758//
759// Atomically stores "d" into *p and returns the byte that was there.
760// 	in:	p in %rdi, d in %sil
761// 	out:	%al = previous *p
762//
763// scratch:	%sil (receives the previous byte before it is copied out)
764	.text
765	PROC  __kmp_xchg_fixed8
766
767	lock
768	xchgb	%sil, (%rdi)	// swap the argument register with memory
769
770	movb	%sil, %al	// previous *p becomes the return value
771	ret
772
773	DEBUG_INFO __kmp_xchg_fixed8
774
775
776//------------------------------------------------------------------------
777// FUNCTION __kmp_xchg_fixed16
778//
779// kmp_int16
780// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
781//
782// Atomically stores "d" into *p and returns the 16-bit value that was there.
783// 	in:	p in %rdi, d in %si
784// 	out:	%ax = previous *p
785// scratch:	%si (receives the previous value before it is copied out)
786	.text
787	PROC  __kmp_xchg_fixed16
788
789	lock
790	xchgw	%si, (%rdi)	// swap the argument register with memory
791
792	movw	%si, %ax	// previous *p becomes the return value
793	ret
794
795	DEBUG_INFO __kmp_xchg_fixed16
796
797
798//------------------------------------------------------------------------
799// FUNCTION __kmp_xchg_fixed32
800//
801// kmp_int32
802// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
803//
804// Atomically stores "d" into *p and returns the 32-bit value that was there.
805// 	in:	p in %rdi, d in %esi
806// 	out:	%eax = previous *p
807//
808// scratch:	%esi (receives the previous value before it is copied out)
809	.text
810	PROC  __kmp_xchg_fixed32
811
812	lock
813	xchgl	%esi, (%rdi)	// swap the argument register with memory
814
815	movl	%esi, %eax	// previous *p becomes the return value
816	ret
817
818	DEBUG_INFO __kmp_xchg_fixed32
819
820
821//------------------------------------------------------------------------
822// FUNCTION __kmp_xchg_fixed64
823//
824// kmp_int64
825// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
826//
827// Atomically stores "d" into *p and returns the 64-bit value that was there.
828// 	in:	p in %rdi, d in %rsi
829// 	out:	%rax = previous *p
830// scratch:	%rsi (receives the previous value before it is copied out)
831	.text
832	PROC  __kmp_xchg_fixed64
833
834	lock
835	xchgq	%rsi, (%rdi)	// swap the argument register with memory
836
837	movq	%rsi, %rax	// previous *p becomes the return value
838	ret
839
840	DEBUG_INFO __kmp_xchg_fixed64
841
842
843//------------------------------------------------------------------------
844// FUNCTION __kmp_compare_and_store8
845//
846// kmp_int8
847// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
848//
849// Stores sv into *p only when *p currently equals cv.
850// 	in:	p in %rdi
851// 		cv in %sil
852//		sv in %dl
853//
854// 	out:	%eax = 1 when the store happened, 0 otherwise
855	.text
856	PROC  __kmp_compare_and_store8
857
858	movl	%esi, %eax	// %al = expected "cv"; cmpxchgb looks only at %al
859	lock
860	cmpxchgb	%dl, (%rdi)	// ZF = 1 and (%rdi) = %dl when (%rdi) == %al
861	sete	%al		// turn ZF into a 0/1 byte
862	movzbl	%al, %eax	// zero-extend the flag into the whole return register
863	ret
864
865	DEBUG_INFO __kmp_compare_and_store8
866
867
868//------------------------------------------------------------------------
869// FUNCTION __kmp_compare_and_store16
870//
871// kmp_int16
872// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
873//
874// Stores sv into *p only when *p currently equals cv.
875// 	in:	p in %rdi
876// 		cv in %si
877//		sv in %dx
878//
879// 	out:	%eax = 1 when the store happened, 0 otherwise
880	.text
881	PROC  __kmp_compare_and_store16
882
883	movl	%esi, %eax	// %ax = expected "cv"; cmpxchgw looks only at %ax
884	lock
885	cmpxchgw	%dx, (%rdi)	// ZF = 1 and (%rdi) = %dx when (%rdi) == %ax
886	sete	%al		// turn ZF into a 0/1 byte
887	movzbl	%al, %eax	// zero-extend the flag into the whole return register
888	ret
889
890	DEBUG_INFO __kmp_compare_and_store16
891
892
893//------------------------------------------------------------------------
894// FUNCTION __kmp_compare_and_store32
895//
896// kmp_int32
897// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
898//
899// Stores sv into *p only when *p currently equals cv.
900// 	in:	p in %rdi
901// 		cv in %esi
902//		sv in %edx
903//
904// 	out:	%eax = 1 when the store happened, 0 otherwise
905	.text
906	PROC  __kmp_compare_and_store32
907
908	movl	%esi, %eax	// expected "cv"
909	lock
910	cmpxchgl	%edx, (%rdi)	// ZF = 1 and (%rdi) = %edx when (%rdi) == %eax
911	sete	%al		// turn ZF into a 0/1 byte
912	movzbl	%al, %eax	// zero-extend the flag into the whole return register
913	ret
914
915	DEBUG_INFO __kmp_compare_and_store32
916
917
918//------------------------------------------------------------------------
919// FUNCTION __kmp_compare_and_store64
920//
921// kmp_int32
922// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
923//
924// Stores sv into *p only when *p currently equals cv.
925// 	in:	p in %rdi
926// 		cv in %rsi
927//		sv in %rdx
928// 	out:	%eax = 1 when the store happened, 0 otherwise
929	.text
930	PROC  __kmp_compare_and_store64
931
932	movq	%rsi, %rax	// expected "cv"
933	lock
934	cmpxchgq	%rdx, (%rdi)	// ZF = 1 and (%rdi) = %rdx when (%rdi) == %rax
935	sete	%al		// turn ZF into a 0/1 byte
936	movzbl	%al, %eax	// zero-extend the flag into the whole return register
937	ret
938
939	DEBUG_INFO __kmp_compare_and_store64
940
941//------------------------------------------------------------------------
942// FUNCTION __kmp_compare_and_store_ret8
943// Compare-and-swap returning the value cmpxchgb leaves in %al rather than a flag.
944// kmp_int8
945// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
946//
947// parameters:
948// 	p:	%rdi
949// 	cv:	%sil (low byte of %esi)
950//	sv:	%dl (low byte of %edx)
951//
952// return:	%al (only the low byte of %eax is written here)
953        .text
954        PROC  __kmp_compare_and_store_ret8
955
956        movb      %sil, %al	// "cv"
957        lock
958        cmpxchgb  %dl,(%rdi)    // if (%rdi) == %al store %dl, else reload %al from (%rdi)
959        ret
960
961        DEBUG_INFO __kmp_compare_and_store_ret8
962
963
964//------------------------------------------------------------------------
965// FUNCTION __kmp_compare_and_store_ret16
966// Compare-and-swap returning the value cmpxchgw leaves in %ax rather than a flag.
967// kmp_int16
968// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
969//
970// parameters:
971// 	p:	%rdi
972// 	cv:	%si
973//	sv:	%dx
974//
975// return:	%ax (only the low 16 bits of %eax are written here)
976        .text
977        PROC  __kmp_compare_and_store_ret16
978
979        movw      %si, %ax	// "cv"
980        lock
981        cmpxchgw  %dx,(%rdi)    // if (%rdi) == %ax store %dx, else reload %ax from (%rdi)
982        ret
983
984        DEBUG_INFO __kmp_compare_and_store_ret16
985
986
987//------------------------------------------------------------------------
988// FUNCTION __kmp_compare_and_store_ret32
989// Compare-and-swap returning the value cmpxchgl leaves in %eax rather than a flag.
990// kmp_int32
991// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
992//
993// parameters:
994// 	p:	%rdi
995// 	cv:	%esi
996//	sv:	%edx
997//
998// return:	%eax
999        .text
1000        PROC  __kmp_compare_and_store_ret32
1001
1002        movl      %esi, %eax	// "cv"
1003        lock
1004        cmpxchgl  %edx,(%rdi)   // if (%rdi) == %eax store %edx, else reload %eax from (%rdi)
1005        ret
1006
1007        DEBUG_INFO __kmp_compare_and_store_ret32
1008
1009
1010//------------------------------------------------------------------------
1011// FUNCTION __kmp_compare_and_store_ret64
1012// Compare-and-swap returning the value cmpxchgq leaves in %rax rather than a flag.
1013// kmp_int64
1014// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
1015//
1016// parameters:
1017// 	p:	%rdi
1018// 	cv:	%rsi
1019//	sv:	%rdx
1020//	return:	%rax (the full 64-bit register is the result of cmpxchgq)
1021        .text
1022        PROC  __kmp_compare_and_store_ret64
1023
1024        movq      %rsi, %rax    // "cv"
1025        lock
1026        cmpxchgq  %rdx,(%rdi)   // if (%rdi) == %rax store %rdx, else reload %rax from (%rdi)
1027        ret
1028
1029        DEBUG_INFO __kmp_compare_and_store_ret64
1030
1031# endif /* !KMP_ASM_INTRINS */
1032
1033
1034# if !KMP_MIC
1035
1036# if !KMP_ASM_INTRINS
1037
1038//------------------------------------------------------------------------
1039// FUNCTION __kmp_xchg_real32
1040// Atomically swaps the 32-bit pattern of "data" into *addr and returns the old pattern.
1041// kmp_real32
1042// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
1043//
1044// parameters:
1045// 	addr:	%rdi
1046// 	data:	%xmm0 (lower 4 bytes)
1047//
1048// return:	%xmm0 (lower 4 bytes)
1049        .text
1050        PROC  __kmp_xchg_real32
1051
1052	movd	%xmm0, %eax	// load "data" to eax
1053
1054         lock
1055         xchgl %eax, (%rdi)	// %eax and (%rdi) trade contents
1056
1057	movd	%eax, %xmm0	// load old value into return register
1058
1059        ret
1060
1061        DEBUG_INFO __kmp_xchg_real32
1062
1063
1064//------------------------------------------------------------------------
1065// FUNCTION __kmp_xchg_real64
1066// Atomically swaps the 64-bit pattern of "data" into *addr and returns the old pattern.
1067// kmp_real64
1068// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
1069//
1070// parameters:
1071//      addr:   %rdi
1072//      data:   %xmm0 (lower 8 bytes)
1073//      return: %xmm0 (lower 8 bytes)
1074        .text
1075        PROC  __kmp_xchg_real64
1076
1077	movd	%xmm0, %rax	// load "data" to rax
1078
1079         lock
1080	xchgq  %rax, (%rdi)	// %rax and (%rdi) trade contents
1081
1082	movd	%rax, %xmm0	// load old value into return register
1083        ret
1084
1085        DEBUG_INFO __kmp_xchg_real64
1086
1087
1088# endif /* !KMP_ASM_INTRINS */
1089
1090# endif /* !KMP_MIC */
1091
1092//------------------------------------------------------------------------
1093// int
1094// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1095//                         int gtid, int tid,
1096//                         int argc, void *p_argv[]
1097// #if OMPT_SUPPORT
1098//                         ,
1099//                         void **exit_frame_ptr
1100// #endif
1101//                       ) {
1102// #if OMPT_SUPPORT
1103//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1104// #endif
1105//
1106//   (*pkfn)( & gtid, & tid, argv[0], ... );
1107//   return 1;
1108// }
1109//
1110// note: at call to pkfn must have %rsp 128-byte aligned for compiler
1111//
1112// parameters:
1113//      %rdi:  	pkfn
1114//	%esi:	gtid
1115//	%edx:	tid
1116//	%ecx:	argc
1117//	%r8:	p_argv
1118//	%r9:	&exit_frame
1119//
1120// locals:
1121//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1122//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1123//
1124// reg temps:
1125//	%rax:	used all over the place
1126//	%rdx:	used in stack pointer alignment calculation
1127//	%r11:	used to traverse p_argv array
1128//	%rsi:	used as temporary for stack parameters
1129//		used as temporary for number of pkfn parms to push
1130//	%rbx:	used to hold pkfn address, and zero constant, callee-save
1131//
1132// return:	%eax 	(always 1/TRUE)
1133__gtid = -16
1134__tid = -24
// The two offsets above are relative to %rbp: once the frame is established the
// routine pushes %rbx (at -8), then gtid (at -16), then tid (at -24).
1135
1136// -- Begin __kmp_invoke_microtask
1137// mark_begin;
1138        .text
1139	PROC  __kmp_invoke_microtask
1140
	// The KMP_CFI_* macros are defined outside this section; presumably they
	// emit the matching .cfi_* unwind directives -- TODO confirm.
1141	pushq 	%rbp		// save base pointer
1142	KMP_CFI_DEF_OFFSET 16
1143	KMP_CFI_OFFSET rbp,-16
1144	movq 	%rsp,%rbp	// establish the base pointer for this routine.
1145	KMP_CFI_REGISTER rbp
1146
1147#if OMPT_SUPPORT
1148	movq	%rbp, (%r9)	// save exit_frame
1149#endif
1150
1151	pushq 	%rbx		// %rbx is callee-saved register
1152	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
1153	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn
1154
	// NOTE(review): argc is documented as arriving in %ecx (32-bit int), yet
	// all 64 bits of %rcx are copied here; this assumes the upper half is
	// zero -- confirm against the callers.
1155	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
1156	movq	$0, %rbx	// constant for cmovs later
1157	subq	$4, %rax	// subtract four args passed in registers to pkfn
1158#if KMP_MIC
	// Branching equivalent of the cmovsq used in the #else path.
1159	js	KMP_LABEL(kmp_0)	// jump to movq
1160	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
1161KMP_LABEL(kmp_0):
1162	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1163KMP_LABEL(kmp_0_exit):
1164#else
1165	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1166#endif // KMP_MIC
1167
1168	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
1169	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8
1170
	// Pre-adjust %rsp so that, after the stack parameters have been pushed,
	// it sits on a 128-byte boundary: %rdx ends up holding the low 7 bits of
	// (%rsp - bytes to push), and that amount is dropped from %rsp.
1171	movq 	%rsp, %rdx	//
1172	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
1173				// without align, stack ptr would be this
1174	movq 	%rdx, %rax	// Save to %rax
1175
1176	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
1177	subq 	%rax, %rdx	// Amount to subtract from %rsp
1178	subq 	%rdx, %rsp	// Prepare the stack ptr --
1179				// now %rsp will align to 128-byte boundary at call site
1180
1181				// setup pkfn parameter reg and stack
1182	movq	%rcx, %rax	// argc -> %rax
	// %rsi still holds max(0, argc-4): zero means everything fits in registers.
1183	cmpq	$0, %rsi
1184	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
1185	shlq	$3, %rcx	// argc*8 -> %rcx
1186	movq 	%r8, %rdx	// p_argv -> %rdx
1187	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx
1188
1189	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx
1190
1191KMP_LABEL(kmp_invoke_push_parms):
1192	// push nth - 7th parms to pkfn on stack
1193	subq	$8, %rdx	// decrement p_argv pointer to previous parm
1194	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
1195	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
	// One parameter pushed; %ecx counts the parameters still to be pushed and
	// the jecxz below leaves the loop when it reaches zero.
1196	subl	$1, %ecx
1197
1198// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
1199//		if the name of the label that is an operand of this jecxz starts with a dot (".");
1200//	   Apple's linker does not support 1-byte length relocation;
1201//         Resolution: replace all .labelX entries with L_labelX.
1202
1203	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
1204	jmp	KMP_LABEL(kmp_invoke_push_parms)
1205	ALIGN 3
1206KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
1207				// order here is important to avoid trashing
1208				// registers used for both input and output parms!
1209	movq	%rdi, %rbx	// pkfn -> %rbx
1210	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1211	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
1212	// Check if argc is 0
1213	cmpq $0, %rax
1214	je KMP_LABEL(kmp_no_args) // Jump ahead
1215
1216	movq	%r8, %r11	// p_argv -> %r11
1217
	// The loads run from the 6th parameter down to the 3rd so that %r8 (the
	// incoming p_argv, now copied to %r11) and %rcx/%rdx are only overwritten
	// once their old contents are no longer needed.
1218#if KMP_MIC
1219	cmpq	$4, %rax	// argc >= 4?
1220	jns	KMP_LABEL(kmp_4)	// jump to movq
1221	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
1222KMP_LABEL(kmp_4):
1223	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1224KMP_LABEL(kmp_4_exit):
1225
1226	cmpq	$3, %rax	// argc >= 3?
1227	jns	KMP_LABEL(kmp_3)	// jump to movq
1228	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
1229KMP_LABEL(kmp_3):
1230	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1231KMP_LABEL(kmp_3_exit):
1232
1233	cmpq	$2, %rax	// argc >= 2?
1234	jns	KMP_LABEL(kmp_2)	// jump to movq
1235	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
1236KMP_LABEL(kmp_2):
1237	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1238KMP_LABEL(kmp_2_exit):
1239
1240	cmpq	$1, %rax	// argc >= 1?
1241	jns	KMP_LABEL(kmp_1)	// jump to movq
1242	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
1243KMP_LABEL(kmp_1):
1244	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1245KMP_LABEL(kmp_1_exit):
1246#else
	// NOTE(review): a cmovcc with a memory source reads that operand even
	// when the condition is false, so p_argv[1..3] may be read for
	// argc < 4 -- confirm p_argv always has readable storage there.
1247	cmpq	$4, %rax	// argc >= 4?
1248	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1249
1250	cmpq	$3, %rax	// argc >= 3?
1251	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1252
1253	cmpq	$2, %rax	// argc >= 2?
1254	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1255
1256	cmpq	$1, %rax	// argc >= 1?
1257	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1258#endif // KMP_MIC
1259
1260KMP_LABEL(kmp_no_args):
1261	call	*%rbx		// call (*pkfn)();
1262	movq	$1, %rax	// move 1 into return register;
1263
	// Everything pushed or reserved below %rbp (gtid, tid, alignment padding,
	// stack parameters) is discarded by reloading %rsp from %rbp.
1264	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
1265	movq 	%rbp, %rsp	// restore stack pointer
1266	popq 	%rbp		// restore frame pointer
1267	KMP_CFI_DEF rsp,8
1268	ret
1269
1270	DEBUG_INFO __kmp_invoke_microtask
1271// -- End  __kmp_invoke_microtask
1272
1273// kmp_uint64
1274// __kmp_hardware_timestamp(void)
1275        .text
// Returns the processor time-stamp counter as one 64-bit value in %rax.
1276	PROC  __kmp_hardware_timestamp
	// rdtsc delivers the counter split across %edx (high half) and %eax (low
	// half); the shift and or below merge the two halves into %rax.
1277	rdtsc
1278	shlq    $32, %rdx
1279	orq     %rdx, %rax
1280	ret
1281
1282	DEBUG_INFO __kmp_hardware_timestamp
1283// -- End  __kmp_hardware_timestamp
1284
1285//------------------------------------------------------------------------
1286// FUNCTION __kmp_bsr32
1287//
1288// int
1289// __kmp_bsr32( int );
1290        .text
// Returns in %eax the bit index of the most significant set bit of the 32-bit
// argument passed in %edi.
1291        PROC  __kmp_bsr32
1292
	// NOTE(review): bsr leaves its destination undefined when the source is
	// zero; presumably callers never pass 0 -- confirm.
1293        bsr    %edi,%eax
1294        ret
1295
1296        DEBUG_INFO __kmp_bsr32
1297
1298// -----------------------------------------------------------------------
1299#endif /* KMP_ARCH_X86_64 */
1300
1301// '
1302#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
1303
1304//------------------------------------------------------------------------
1305// int
1306// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
1307//                         int gtid, int tid,
1308//                         int argc, void *p_argv[]
1309// #if OMPT_SUPPORT
1310//                         ,
1311//                         void **exit_frame_ptr
1312// #endif
1313//                       ) {
1314// #if OMPT_SUPPORT
1315//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1316// #endif
1317//
1318//   (*pkfn)( & gtid, & tid, argv[0], ... );
1319//
1320// // FIXME: This is done at call-site and can be removed here.
1321// #if OMPT_SUPPORT
1322//   *exit_frame_ptr = 0;
1323// #endif
1324//
1325//   return 1;
1326// }
1327//
1328// parameters:
1329//	x0:	pkfn
1330//	w1:	gtid
1331//	w2:	tid
1332//	w3:	argc
1333//	x4:	p_argv
1334//	x5:	&exit_frame
1335//
1336// locals:
1337//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1338//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1339//
1340// reg temps:
1341//	 x8:	used to hold pkfn address
1342//	 w9:	used as temporary for number of pkfn parms
1343//	x10:	used to traverse p_argv array
1344//	x11:	used as temporary for stack placement calculation
1345//	x12:	used as temporary for stack parameters
1346//	x19:	used to preserve exit_frame_ptr, callee-save
1347//
1348// return:	w0	(always 1/TRUE)
1349//
1350
1351__gtid = 4
1352__tid = 8
// gtid and tid are spilled 4 and 8 bytes below x29; the code below always uses
// these constants negated (#-__gtid / #-__tid) or with sub.
1353
1354// -- Begin __kmp_invoke_microtask
1355// mark_begin;
1356	.text
1357	PROC __kmp_invoke_microtask
1358	PACBTI_C
1359
	// Save the frame pointer and link register.  With OMPT support x19/x20
	// are saved as a second 16-byte pair, because x19 is used below to keep
	// exit_frame_ptr alive across the call to pkfn.
1360	stp	x29, x30, [sp, #-16]!
1361# if OMPT_SUPPORT
1362	stp	x19, x20, [sp, #-16]!
1363# endif
1364	mov	x29, sp
1365
	// Reserve (1 + argc/2) * 16 bytes below x29: w9 = 1, then w9 += argc >> 1,
	// then sp -= w9 << 4.  x11 = new sp is the address at which the first
	// stack-passed pkfn argument is written by the loop further down.
1366	orr	w9, wzr, #1
1367	add	w9, w9, w3, lsr #1
1368	sub	sp, sp, w9, uxtw #4
1369	mov	x11, sp
1370
	// Move the incoming arguments out of the registers that are about to be
	// reused for pkfn's own arguments: pkfn -> x8, gtid/tid -> stack slots,
	// argc -> w9, p_argv -> x10.
1371	mov	x8, x0
1372	str	w1, [x29, #-__gtid]
1373	str	w2, [x29, #-__tid]
1374	mov	w9, w3
1375	mov	x10, x4
1376# if OMPT_SUPPORT
	// *exit_frame_ptr = x29; the pointer itself is kept in x19.
1377	mov	x19, x5
1378	str	x29, [x19]
1379# endif
1380
	// First two pkfn arguments: x0 = &gtid, x1 = &tid.
1381	sub	x0, x29, #__gtid
1382	sub	x1, x29, #__tid
1383
	// Load p_argv[0..5] into x2..x7, leaving for the call as soon as the
	// remaining-argument count in w9 reaches zero.  From the second load on,
	// the pre-indexed form "[x10, #8]!" advances x10 before loading.
1384	cbz	w9, KMP_LABEL(kmp_1)
1385	ldr	x2, [x10]
1386
1387	sub	w9, w9, #1
1388	cbz	w9, KMP_LABEL(kmp_1)
1389	ldr	x3, [x10, #8]!
1390
1391	sub	w9, w9, #1
1392	cbz	w9, KMP_LABEL(kmp_1)
1393	ldr	x4, [x10, #8]!
1394
1395	sub	w9, w9, #1
1396	cbz	w9, KMP_LABEL(kmp_1)
1397	ldr	x5, [x10, #8]!
1398
1399	sub	w9, w9, #1
1400	cbz	w9, KMP_LABEL(kmp_1)
1401	ldr	x6, [x10, #8]!
1402
1403	sub	w9, w9, #1
1404	cbz	w9, KMP_LABEL(kmp_1)
1405	ldr	x7, [x10, #8]!
1406
	// Copy every remaining p_argv entry to the stack area: x12 is the
	// transfer register and x11 is post-incremented by 8 after each store.
1407KMP_LABEL(kmp_0):
1408	sub	w9, w9, #1
1409	cbz	w9, KMP_LABEL(kmp_1)
1410	ldr	x12, [x10, #8]!
1411	str	x12, [x11], #8
1412	b	KMP_LABEL(kmp_0)
1413KMP_LABEL(kmp_1):
	// Call pkfn, set the return value w0 = 1 and release the area reserved
	// below x29.
1414	blr	x8
1415	orr	w0, wzr, #1
1416	mov	sp, x29
1417# if OMPT_SUPPORT
	// *exit_frame_ptr = 0, then restore x19/x20.
1418	str	xzr, [x19]
1419	ldp	x19, x20, [sp], #16
1420# endif
1421	ldp	x29, x30, [sp], #16
1422	PACBTI_RET
1423	ret
1424
1425	DEBUG_INFO __kmp_invoke_microtask
1426// -- End  __kmp_invoke_microtask
1427
1428#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */
1429
1430#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
1431
1432//------------------------------------------------------------------------
1433// int
1434// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
1435//                         int gtid, int tid,
1436//                         int argc, void *p_argv[]
1437// #if OMPT_SUPPORT
1438//                         ,
1439//                         void **exit_frame_ptr
1440// #endif
1441//                       ) {
1442// #if OMPT_SUPPORT
1443//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1444// #endif
1445//
1446//   (*pkfn)( & gtid, & tid, argv[0], ... );
1447//
1448// // FIXME: This is done at call-site and can be removed here.
1449// #if OMPT_SUPPORT
1450//   *exit_frame_ptr = 0;
1451// #endif
1452//
1453//   return 1;
1454// }
1455//
1456// parameters:
1457//	r0:	pkfn
1458//	r1:	gtid
1459//	r2:	tid
1460//	r3:	argc
1461//	r4(stack):	p_argv
1462//	r5(stack):	&exit_frame
1463//
1464// locals:
1465//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1466//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1467//
1468// reg temps:
1469//	 r4:	used to hold pkfn address
1470//	 r5:	used as temporary for number of pkfn parms
1471//	 r6:	used to traverse p_argv array
1472//	 r7:	frame pointer (in some configurations)
1473//	 r8:	used as temporary for stack placement calculation
1474//	 	and as pointer to base of callee saved area
1475//	 r9:	used as temporary for stack parameters
1476//	r10:	used to preserve exit_frame_ptr, callee-save
1477//	r11:	frame pointer (in some configurations)
1478//
1479// return:	r0	(always 1/TRUE)
1480//
1481
1482__gtid = 4
1483__tid = 8
// gtid and tid are stored 4 and 8 bytes below the sp value captured in r8
// right after the register push (see the "#-__gtid" / "#-__tid" stores).
1484
1485// -- Begin __kmp_invoke_microtask
1486// mark_begin;
1487	.text
1488	PROC __kmp_invoke_microtask
1489
1490	// Pushing one extra register (r3) to keep the stack aligned
1491	// for when we call pkfn below
1492	push	{r3-r11,lr}
	// Ten registers were pushed above, so the caller's stack arguments now
	// start at sp + 10*4.
1493	// Load p_argv and &exit_frame
1494	ldr	r4, [sp, #10*4]
1495# if OMPT_SUPPORT
1496	ldr	r5, [sp, #11*4]
1497# endif
1498
	// FP names the frame-pointer register for this configuration and FPOFF is
	// the offset of that register's slot inside the block pushed above (r7 is
	// the 5th pushed register, r11 the 9th).
1499# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
1500# define FP r7
1501# define FPOFF 4*4
1502#else
1503# define FP r11
1504# define FPOFF 8*4
1505#endif
1506	add	FP, sp, #FPOFF
1507# if OMPT_SUPPORT
	// *exit_frame_ptr = FP; the pointer is kept in r10 for use after the call.
1508	mov	r10, r5
1509	str	FP, [r10]
1510# endif
1511	mov	r8, sp
1512
1513	// Calculate how much stack to allocate, in increments of 8 bytes.
1514	// We strictly need 4*(argc-2) bytes (2 arguments are passed in
1515	// registers) but allocate 4*argc for simplicity (to avoid needing
1516	// to handle the argc<2 cases). We align the number of bytes
1517	// allocated to 8 bytes, to keep the stack aligned. (Since we
1518	// already allocate more than enough, it's ok to round down
1519	// instead of up for the alignment.) We allocate another extra
1520	// 8 bytes for gtid and tid.
1521	mov	r5, #1
1522	add	r5, r5, r3, lsr #1
1523	sub	sp, sp, r5, lsl #3
1524
	// Spill gtid/tid just below the old sp (r8), then free r0/r3 for pkfn's
	// arguments: argc -> r5, p_argv -> r6, pkfn -> r4.
1525	str	r1, [r8, #-__gtid]
1526	str	r2, [r8, #-__tid]
1527	mov	r5, r3
1528	mov	r6, r4
1529	mov	r4, r0
1530
1531	// Prepare the first 2 parameters to pkfn - pointers to gtid and tid
1532	// in our stack frame.
1533	sub	r0, r8, #__gtid
1534	sub	r1, r8, #__tid
1535
	// r8 now becomes the write cursor for stack-passed pkfn arguments.
1536	mov	r8, sp
1537
1538	// Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
1539	cmp	r5, #0
1540	beq	KMP_LABEL(kmp_1)
1541	ldr	r2, [r6]
1542
1543	subs	r5, r5, #1
1544	beq	KMP_LABEL(kmp_1)
1545	ldr	r3, [r6, #4]!
1546
1547	// Loop, loading the rest of p_argv and writing the elements on the
1548	// stack.
1549KMP_LABEL(kmp_0):
1550	subs	r5, r5, #1
1551	beq	KMP_LABEL(kmp_1)
1552	ldr	r12, [r6, #4]!
1553	str	r12, [r8], #4
1554	b	KMP_LABEL(kmp_0)
1555KMP_LABEL(kmp_1):
	// Call pkfn and set the return value r0 = 1.
1556	blx	r4
1557	mov	r0, #1
1558
	// FP was set to (sp after the push) + FPOFF, so subtracting FPOFF gives
	// back that sp and drops the dynamically allocated area.
1559	sub	r4, FP, #FPOFF
1560	mov	sp, r4
1561# undef FP
1562# undef FPOFF
1563
1564# if OMPT_SUPPORT
	// *exit_frame_ptr = 0
1565	mov	r1, #0
1566	str	r1, [r10]
1567# endif
	// Restore the pushed registers; the saved lr is popped straight into pc,
	// which performs the return.
1568	pop	{r3-r11,pc}
1569
1570	DEBUG_INFO __kmp_invoke_microtask
1571// -- End  __kmp_invoke_microtask
1572
1573#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
1574
1575#if KMP_ARCH_PPC64
1576
1577//------------------------------------------------------------------------
1578// int
1579// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
1580//                         int gtid, int tid,
1581//                         int argc, void *p_argv[]
1582// #if OMPT_SUPPORT
1583//                         ,
1584//                         void **exit_frame_ptr
1585// #endif
1586//                       ) {
1587// #if OMPT_SUPPORT
1588//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1589// #endif
1590//
1591//   (*pkfn)( & gtid, & tid, argv[0], ... );
1592//
1593// // FIXME: This is done at call-site and can be removed here.
1594// #if OMPT_SUPPORT
1595//   *exit_frame_ptr = 0;
1596// #endif
1597//
1598//   return 1;
1599// }
1600//
1601// parameters:
1602//	r3:	pkfn
1603//	r4:	gtid
1604//	r5:	tid
1605//	r6:	argc
1606//	r7:	p_argv
1607//	r8:	&exit_frame
1608//
1609// return:	r3	(always 1/TRUE)
1610//
1611	.text
1612# if KMP_ARCH_PPC64_ELFv2
1613	.abiversion 2
1614# endif
1615	.globl	__kmp_invoke_microtask
1616
1617# if KMP_ARCH_PPC64_ELFv2
1618	.p2align	4
1619# else
1620	.p2align	2
1621# endif
1622
1623	.type	__kmp_invoke_microtask,@function
1624
	// ELFv2: the symbol labels the code itself; the two instructions between
	// .Lfunc_gep0 and .Lfunc_lep0 compute r2 from r12 and the .TOC. symbol.
	// ELFv1: the symbol labels a three-quadword entry in the .opd section
	// (code address, .TOC.@tocbase, 0) and the code starts at .Lfunc_begin0.
1625# if KMP_ARCH_PPC64_ELFv2
1626__kmp_invoke_microtask:
1627.Lfunc_begin0:
1628.Lfunc_gep0:
1629	addis 2, 12, .TOC.-.Lfunc_gep0@ha
1630	addi 2, 2, .TOC.-.Lfunc_gep0@l
1631.Lfunc_lep0:
1632	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1633# else
1634	.section	.opd,"aw",@progbits
1635__kmp_invoke_microtask:
1636	.p2align	3
1637	.quad	.Lfunc_begin0
1638	.quad	.TOC.@tocbase
1639	.quad	0
1640	.text
1641.Lfunc_begin0:
1642# endif
1643
1644// -- Begin __kmp_invoke_microtask
1645// mark_begin;
1646
1647// We need to allocate a stack frame large enough to hold all of the parameters
1648// on the stack for the microtask plus what this function needs. That's 48
1649// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1650// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1651// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1652// to save r30 to hold a copy of r8.
1653
1654	.cfi_startproc
	// r0 = link register; r31 is saved at -8(r1) and the link register value
	// at 16(r1), both relative to the incoming stack pointer.
1655	mflr 0
1656	std 31, -8(1)
1657	std 0, 16(1)
1658
1659// This is unusual because normally we'd set r31 equal to r1 after the stack
1660// frame is established. In this case, however, we need to dynamically compute
1661// the stack frame size, and so we keep a direct copy of r1 to access our
1662// register save areas and restore the r1 value before returning.
1663	mr 31, 1
1664	.cfi_def_cfa_register r31
1665	.cfi_offset r31, -8
1666	.cfi_offset lr, 16
1667
1668// Compute the size necessary for the local stack frame.
	// r12 = -(fixed part + 8*argc), where the fixed part is 72 (ELFv2) or
	// 88 (ELFv1) bytes.
1669# if KMP_ARCH_PPC64_ELFv2
1670	li 12, 72
1671# else
1672	li 12, 88
1673# endif
1674	sldi 0, 6, 3
1675	add 12, 0, 12
1676	neg 12, 12
1677
1678// We need to make sure that the stack frame stays aligned (to 16 bytes).
	// Masking the negative size with -16 clears its low four bits.
1679	li 0, -16
1680	and 12, 0, 12
1681
1682// Establish the local stack frame.
	// stdux stores the old r1 at r1 + r12 and updates r1 to that address.
1683	stdux 1, 1, 12
1684
1685# if OMPT_SUPPORT
	// Save r30 at -16(r31), write the new r1 through exit_frame_ptr (r8) and
	// keep that pointer in r30 for use after the call.
1686	.cfi_offset r30, -16
1687	std 30, -16(31)
1688	std 1, 0(8)
1689	mr 30, 8
1690# endif
1691
1692// Store gtid and tid to the stack because they're passed by reference to the microtask.
1693	stw 4, -20(31)
1694	stw 5, -24(31)
1695
	// r12 = argc, r4 = p_argv.  Each compare/branch pair below leaves for
	// .Lcall once all argc entries have been loaded; p_argv[0..5] go into
	// r5..r10.
1696	mr 12, 6
1697	mr 4, 7
1698
1699	cmpwi 0, 12, 1
1700	blt	 0, .Lcall
1701
1702	ld 5, 0(4)
1703
1704	cmpwi 0, 12, 2
1705	blt	 0, .Lcall
1706
1707	ld 6, 8(4)
1708
1709	cmpwi 0, 12, 3
1710	blt	 0, .Lcall
1711
1712	ld 7, 16(4)
1713
1714	cmpwi 0, 12, 4
1715	blt	 0, .Lcall
1716
1717	ld 8, 24(4)
1718
1719	cmpwi 0, 12, 5
1720	blt	 0, .Lcall
1721
1722	ld 9, 32(4)
1723
1724	cmpwi 0, 12, 6
1725	blt	 0, .Lcall
1726
1727	ld 10, 40(4)
1728
1729	cmpwi 0, 12, 7
1730	blt	 0, .Lcall
1731
1732// There are more than 6 microtask parameters, so we need to store the
1733// remainder to the stack.
	// The count register gets argc - 6, the number of entries left to copy.
1734	addi 12, 12, -6
1735	mtctr 12
1736
1737// These are set to 8 bytes before the first desired store address (we're using
1738// pre-increment loads and stores in the loop below). The parameter save area
1739// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1740// 32 + 8*8 == 96 bytes above r1 for ELFv2.
1741	addi 4, 4, 40
1742# if KMP_ARCH_PPC64_ELFv2
1743	addi 12, 1, 88
1744# else
1745	addi 12, 1, 104
1746# endif
1747
	// Copy loop: ldu/stdu advance r4 and r12 by 8 before each access, and
	// bdnz decrements the count register and repeats while it is non-zero.
1748.Lnext:
1749	ldu 0, 8(4)
1750	stdu 0, 8(12)
1751	bdnz .Lnext
1752
1753.Lcall:
	// Save r2 in the current frame (24(r1) for ELFv2, 40(r1) for ELFv1) and
	// put the code address of pkfn into r12.
1754# if KMP_ARCH_PPC64_ELFv2
1755	std 2, 24(1)
1756	mr 12, 3
1757#else
1758	std 2, 40(1)
1759// For ELFv1, we need to load the actual function address from the function descriptor.
	// The three doublewords of the descriptor at r3 go to r12, r2 and r11.
1760	ld 12, 0(3)
1761	ld 2, 8(3)
1762	ld 11, 16(3)
1763#endif
1764
	// First two microtask arguments: r3 = &gtid, r4 = &tid.
1765	addi 3, 31, -20
1766	addi 4, 31, -24
1767
	// Indirect call through the count register; r2 is reloaded from its save
	// slot afterwards.
1768	mtctr 12
1769	bctrl
1770# if KMP_ARCH_PPC64_ELFv2
1771	ld 2, 24(1)
1772# else
1773	ld 2, 40(1)
1774# endif
1775
1776# if OMPT_SUPPORT
	// *exit_frame_ptr = 0
1777	li 3, 0
1778	std 3, 0(30)
1779# endif
1780
	// Return value: r3 = 1.
1781	li 3, 1
1782
1783# if OMPT_SUPPORT
1784	ld 30, -16(31)
1785# endif
1786
	// Restore r1 from the copy kept in r31, then reload the link register
	// value and r31 from the slots written in the prologue.
1787	mr 1, 31
1788	ld 0, 16(1)
1789	ld 31, -8(1)
1790	mtlr 0
1791	blr
1792
	// NOTE(review): the 12 zero bytes below follow the return instruction;
	// their purpose is not visible here (possibly a traceback-table
	// placeholder) -- confirm.
1793	.long	0
1794	.quad	0
1795.Lfunc_end0:
1796	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1797	.cfi_endproc
1798
1799// -- End  __kmp_invoke_microtask
1800
1801#endif /* KMP_ARCH_PPC64 */
1802
1803#if KMP_ARCH_RISCV64
1804
1805//------------------------------------------------------------------------
1806//
1807// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1808//
1809// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1810//                            void *p_argv[]
1811// #if OMPT_SUPPORT
1812//                            ,
1813//                            void **exit_frame_ptr
1814// #endif
1815//                            ) {
1816// #if OMPT_SUPPORT
1817//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1818// #endif
1819//
1820//   (*pkfn)(&gtid, &tid, argv[0], ...);
1821//
1822//   return 1;
1823// }
1824//
1825// Parameters:
1826//   a0: pkfn
1827//   a1: gtid
1828//   a2: tid
1829//   a3: argc
1830//   a4: p_argv
1831//   a5: exit_frame_ptr
1832//
1833// Locals:
1834//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1835//   __tid: tid param pushed on stack so can pass &tid to pkfn
1836//
1837// Temp. registers:
1838//
1839//  t0: used to calculate the dynamic stack size / used to hold pkfn address
1840//  t1: used as temporary for stack placement calculation
1841//  t2: used as temporary for stack arguments
1842//  t3: used as temporary for number of remaining pkfn parms
1843//  t4: used to traverse p_argv array
1844//
1845// return: a0 (always 1/TRUE)
1846//
1847
1848__gtid = -20
1849__tid = -24
// Offsets of the gtid/tid spill slots relative to fp.  The saved ra and fp
// occupy fp-8 and fp-16, so these two words sit directly below them.
1850
1851// -- Begin __kmp_invoke_microtask
1852// mark_begin;
1853	.text
1854	.globl	__kmp_invoke_microtask
1855	.p2align	1
1856	.type	__kmp_invoke_microtask,@function
1857__kmp_invoke_microtask:
1858	.cfi_startproc
1859
1860	// First, save ra and fp
1861	addi	sp, sp, -16
1862	sd	ra, 8(sp)
1863	sd	fp, 0(sp)
1864	addi	fp, sp, 16
1865	.cfi_def_cfa	fp, 0
1866	.cfi_offset	ra, -8
1867	.cfi_offset	fp, -16
1868
1869	// Compute the dynamic stack size:
1870	//
1871	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1872	//   reference
1873	// - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
1874	//   function by register. Given that we have 8 of such registers (a[0-7])
1875	//   and two + 'argc' arguments (consider &gtid and &tid), we need to
1876	//   reserve max(0, argc - 6)*8 extra bytes
1877	//
1878	// The total number of bytes is then max(0, argc - 6)*8 + 8
1879
1880	// Compute max(0, argc - 6) using the following bithack:
1881	// max(0, x) = x - (x & (x >> 31)), where x := argc - 6
1882	// Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
	// NOTE(review): the shift amount of 31 on a 64-bit register assumes that
	// argc - 6 fits in a signed 32-bit value -- confirm.
1883	addi	t0, a3, -6
1884	srai	t1, t0, 31
1885	and	t1, t0, t1
1886	sub	t0, t0, t1
1887
	// One extra 8-byte slot for gtid/tid, then convert slots to bytes.
1888	addi	t0, t0, 1
1889
1890	slli	t0, t0, 3
1891	sub	sp, sp, t0
1892
1893	// Align the stack to 16 bytes
1894	andi	sp, sp, -16
1895
	// Free a0/a3/a4 for pkfn's own arguments: pkfn -> t0, argc -> t3,
	// p_argv -> t4.
1896	mv	t0, a0
1897	mv	t3, a3
1898	mv	t4, a4
1899
1900#if OMPT_SUPPORT
1901	// Save frame pointer into exit_frame
1902	sd	fp, 0(a5)
1903#endif
1904
1905	// Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
1906
1907	sw	a1, __gtid(fp)
1908	sw	a2, __tid(fp)
1909
1910	addi	a0, fp, __gtid
1911	addi	a1, fp, __tid
1912
	// Load p_argv[0..5] into a2..a7; t3 counts the arguments still to be
	// placed and each beqz leaves for the call once it reaches zero.
1913	beqz	t3, .L_kmp_3
1914	ld	a2, 0(t4)
1915
1916	addi	t3, t3, -1
1917	beqz	t3, .L_kmp_3
1918	ld	a3, 8(t4)
1919
1920	addi	t3, t3, -1
1921	beqz	t3, .L_kmp_3
1922	ld	a4, 16(t4)
1923
1924	addi	t3, t3, -1
1925	beqz	t3, .L_kmp_3
1926	ld	a5, 24(t4)
1927
1928	addi	t3, t3, -1
1929	beqz	t3, .L_kmp_3
1930	ld	a6, 32(t4)
1931
1932	addi	t3, t3, -1
1933	beqz	t3, .L_kmp_3
1934	ld	a7, 40(t4)
1935
1936	// Prepare any additional argument passed through the stack
	// t4 -> &p_argv[6], t1 -> first outgoing stack slot (at sp).  The loop is
	// entered at its decrement-and-test part (.L_kmp_2), so the body at
	// .L_kmp_1 only runs while arguments remain.
1937	addi	t4, t4, 48
1938	mv	t1, sp
1939	j .L_kmp_2
1940.L_kmp_1:
1941	ld	t2, 0(t4)
1942	sd	t2, 0(t1)
1943	addi	t4, t4, 8
1944	addi	t1, t1, 8
1945.L_kmp_2:
1946	addi	t3, t3, -1
1947	bnez	t3, .L_kmp_1
1948
1949.L_kmp_3:
1950	// Call pkfn function
1951	jalr	t0
1952
1953	// Restore stack and return
1954
	// Return value: a0 = 1.
1955	addi	a0, zero, 1
1956
	// fp - 16 is the sp value from right after the prologue's first addi, so
	// the saved fp/ra can be reloaded from 0(sp) and 8(sp).
1957	addi	sp, fp, -16
1958	ld	fp, 0(sp)
1959	ld	ra, 8(sp)
1960	addi	sp, sp, 16
1961	ret
1962.Lfunc_end0:
1963	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
1964	.cfi_endproc
1965
1966// -- End  __kmp_invoke_microtask
1967
1968#endif /* KMP_ARCH_RISCV64 */
1969
1970#if KMP_ARCH_LOONGARCH64
1971
1972//------------------------------------------------------------------------
1973//
1974// typedef void (*microtask_t)(int *gtid, int *tid, ...);
1975//
1976// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1977//                            void *p_argv[]
1978// #if OMPT_SUPPORT
1979//                            ,
1980//                            void **exit_frame_ptr
1981// #endif
1982//                            ) {
1983// #if OMPT_SUPPORT
1984//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1985// #endif
1986//
1987//   (*pkfn)(&gtid, &tid, argv[0], ...);
1988//
1989//   return 1;
1990// }
1991//
1992// Parameters:
1993//   a0: pkfn
1994//   a1: gtid
1995//   a2: tid
1996//   a3: argc
1997//   a4: p_argv
1998//   a5: exit_frame_ptr
1999//
2000// Locals:
2001//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
2002//   __tid: tid param pushed on stack so can pass &tid to pkfn
2003//
2004// Temp registers:
2005//
2006//  t0: used to calculate the dynamic stack size / used to hold pkfn address
2007//  t1: used as temporary for stack placement calculation
2008//  t2: used as temporary for stack arguments
2009//  t3: used as temporary for number of remaining pkfn parms
2010//  t4: used to traverse p_argv array
2011//
2012// return: a0 (always 1/TRUE)
2013//
2014
2015// -- Begin __kmp_invoke_microtask
2016// mark_begin;
2017	.text
2018	.globl	__kmp_invoke_microtask
2019	.p2align	2
2020	.type	__kmp_invoke_microtask,@function
2021__kmp_invoke_microtask:
2022	.cfi_startproc
2023
2024	// First, save ra and fp
2025	addi.d	$sp, $sp, -16
2026	st.d	$ra, $sp, 8
2027	st.d	$fp, $sp, 0
2028	addi.d	$fp, $sp, 16
	// The CFI directives use register numbers 22 and 1; given the stores
	// above these presumably denote $fp and $ra -- TODO confirm.
2029	.cfi_def_cfa	22, 0
2030	.cfi_offset	1, -8
2031	.cfi_offset	22, -16
2032
2033	// Compute the dynamic stack size:
2034	//
2035	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
2036	//   reference
2037	// - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
2038	//   function by register. Given that we have 8 of such registers (a[0-7])
2039	//   and two + 'argc' arguments (consider &gtid and &tid), we need to
2040	//   reserve max(0, argc - 6)*8 extra bytes
2041	//
2042	// The total number of bytes is then max(0, argc - 6)*8 + 8
2043
	// t0 = argc - 6; t1 = 1 when t0 is negative; masknez then zeroes t0 when
	// t1 is non-zero, giving max(0, argc - 6).  One slot is added for
	// gtid/tid and the slot count is converted to bytes.
2044	addi.d  $t0, $a3, -6
2045	slt  $t1, $t0, $zero
2046	masknez  $t0, $t0, $t1
2047	addi.d  $t0, $t0, 1
2048	slli.d	$t0, $t0, 3
2049	sub.d	$sp, $sp, $t0
2050
2051	// Align the stack to 16 bytes
	// (bits 3..0 of $sp are overwritten with bits taken from $zero)
2052	bstrins.d $sp, $zero, 3, 0
2053
	// Free a0/a3/a4 for pkfn's own arguments: pkfn -> t0, argc -> t3,
	// p_argv -> t4.
2054	move	$t0, $a0
2055	move	$t3, $a3
2056	move	$t4, $a4
2057
2058#if OMPT_SUPPORT
2059	// Save frame pointer into exit_frame
2060	st.d	$fp, $a5, 0
2061#endif
2062
2063	// Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
2064
	// gtid and tid are spilled at fp-20 and fp-24 (directly below the saved
	// ra/fp pair); a0/a1 receive the addresses of those two words.
2065	st.w	$a1, $fp, -20
2066	st.w	$a2, $fp, -24
2067
2068	addi.d	$a0, $fp, -20
2069	addi.d	$a1, $fp, -24
2070
	// Load p_argv[0..5] into a2..a7; t3 counts the arguments still to be
	// placed and each beqz leaves for the call once it reaches zero.
2071	beqz	$t3, .L_kmp_3
2072	ld.d	$a2, $t4, 0
2073
2074	addi.d	$t3, $t3, -1
2075	beqz	$t3, .L_kmp_3
2076	ld.d	$a3, $t4, 8
2077
2078	addi.d	$t3, $t3, -1
2079	beqz	$t3, .L_kmp_3
2080	ld.d	$a4, $t4, 16
2081
2082	addi.d	$t3, $t3, -1
2083	beqz	$t3, .L_kmp_3
2084	ld.d	$a5, $t4, 24
2085
2086	addi.d	$t3, $t3, -1
2087	beqz	$t3, .L_kmp_3
2088	ld.d	$a6, $t4, 32
2089
2090	addi.d	$t3, $t3, -1
2091	beqz	$t3, .L_kmp_3
2092	ld.d	$a7, $t4, 40
2093
2094	// Prepare any additional argument passed through the stack
	// t4 -> &p_argv[6], t1 -> first outgoing stack slot (at sp).  The loop is
	// entered at its decrement-and-test part (.L_kmp_2), so the body at
	// .L_kmp_1 only runs while arguments remain.
2095	addi.d	$t4, $t4, 48
2096	move	$t1, $sp
2097	b .L_kmp_2
2098.L_kmp_1:
2099	ld.d	$t2, $t4, 0
2100	st.d	$t2, $t1, 0
2101	addi.d	$t4, $t4, 8
2102	addi.d	$t1, $t1, 8
2103.L_kmp_2:
2104	addi.d	$t3, $t3, -1
2105	bnez	$t3, .L_kmp_1
2106
2107.L_kmp_3:
2108	// Call pkfn function
2109	jirl	$ra, $t0, 0
2110
2111	// Restore stack and return
2112
	// Return value: a0 = 1.
2113	addi.d	$a0, $zero, 1
2114
	// fp - 16 is the sp value from right after the prologue's first addi.d,
	// so the saved fp/ra can be reloaded from offsets 0 and 8.
2115	addi.d	$sp, $fp, -16
2116	ld.d	$fp, $sp, 0
2117	ld.d	$ra, $sp, 8
2118	addi.d	$sp, $sp, 16
2119	jr $ra
2120.Lfunc_end0:
2121	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
2122	.cfi_endproc
2123
2124// -- End  __kmp_invoke_microtask
2125
2126#endif /* KMP_ARCH_LOONGARCH64 */
2127
2128#if KMP_ARCH_VE
2129
2130//------------------------------------------------------------------------
2131//
2132// typedef void (*microtask_t)(int *gtid, int *tid, ...);
2133//
2134// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
2135//                            void *p_argv[]
2136// #if OMPT_SUPPORT
2137//                            ,
2138//                            void **exit_frame_ptr
2139// #endif
2140//                            ) {
2141// #if OMPT_SUPPORT
2142//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
2143// #endif
2144//
2145//   (*pkfn)(&gtid, &tid, argv[0], ...);
2146//
2147//   return 1;
2148// }
2149//
2150// Parameters:
2151//   s0: pkfn
2152//   s1: gtid
2153//   s2: tid
2154//   s3: argc
2155//   s4: p_argv
2156//   s5: exit_frame_ptr
2157//
2158// Locals:
2159//   __gtid: gtid param pushed on stack so can pass &gtid to pkfn
2160//   __tid: tid param pushed on stack so can pass &tid to pkfn
2161//
2162// Temp. registers:
2163//
2164//  s34: used to calculate the dynamic stack size
2165//  s35: used as temporary for stack placement calculation
2166//  s36: used as temporary for stack arguments
2167//  s37: used as temporary for number of remaining pkfn parms
2168//  s38: used to traverse p_argv array
2169//
2170// return: s0 (always 1/TRUE)
2171//
2172
2173__gtid = -4
2174__tid = -8
// Offsets of the gtid/tid spill slots relative to %fp.
2175
2176// -- Begin __kmp_invoke_microtask
2177// mark_begin;
2178	.text
2179	.globl	__kmp_invoke_microtask
2180	// A function requires 8 bytes align.
2181	.p2align	3
2182	.type	__kmp_invoke_microtask,@function
2183__kmp_invoke_microtask:
2184	.cfi_startproc
2185
2186	// First, save fp and lr.  VE stores them at caller stack frame.
2187	st	%fp, 0(, %sp)
2188	st	%lr, 8(, %sp)
2189	or	%fp, 0, %sp
2190	.cfi_def_cfa	%fp, 0
2191	.cfi_offset	%lr, 8
2192	.cfi_offset	%fp, 0
2193
2194	// Compute the dynamic stack size:
2195	//
2196	// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them
2197	//   by reference
2198	// - We need 8 bytes for whole arguments.  We have two + 'argc'
2199	//   arguments (consider &gtid and &tid).  We need to reserve
2200	//   (argc + 2) * 8 bytes.
2201	// - We need 176 bytes for RSA and others
2202	//
2203	// The total number of bytes is then (argc + 2) * 8 + 8 + 176.
2204	//
2205	// |------------------------------|
2206	// | return address of callee     | 8(%fp)
2207	// |------------------------------|
2208	// | frame pointer of callee      | 0(%fp)
2209	// |------------------------------| <------------------ %fp
2210	// | __tid / __gtid               | -8(%fp) / -4(%fp)
2211	// |------------------------------|
2212	// | argc+2 for arguments         | 176(%sp)
2213	// |------------------------------|
2214	// | RSA                          |
2215	// |------------------------------|
2216	// | return address               |
2217	// |------------------------------|
2218	// | frame pointer                |
2219	// |------------------------------| <------------------ %sp
2220
	// s34 = (argc + 2) * 8 + 184, i.e. the total from the comment above
	// (184 = 8 + 176); %sp is lowered by that amount.
2221	adds.w.sx	%s34, 2, %s3
2222	sll	%s34, %s34, 3
2223	lea	%s34, 184(, %s34)
2224	subs.l	%sp, %sp, %s34
2225
2226	// Align the stack to 16 bytes.
2227	and	%sp, -16, %sp
2228
2229	// Save pkfn.
2230	or	%s12, 0, %s0
2231
2232	// Call host to allocate stack if it is necessary.
	// The request is skipped when %sp >= %sl.  Otherwise three values
	// (0x13b, %sl, %sp) are written at offsets 0/8/16 from the address
	// loaded from 24(%tp), followed by monc.
	// NOTE(review): the meaning of the constant 0x13b is not visible here --
	// confirm against the VE ABI documentation.
2233	brge.l	%sp, %sl, .L_kmp_pass
2234	ld	%s61, 24(, %tp)
2235	lea	%s63, 0x13b
2236	shm.l	%s63, 0(%s61)
2237	shm.l	%sl, 8(%s61)
2238	shm.l	%sp, 16(%s61)
2239	monc
2240
2241.L_kmp_pass:
	// s35 -> outgoing argument area at 176(%sp); s37 = argc (sign-extended
	// from 32 bits); s38 = p_argv.
2242	lea	%s35, 176(, %sp)
2243	adds.w.sx	%s37, 0, %s3
2244	or	%s38, 0, %s4
2245
2246#if OMPT_SUPPORT
2247	// Save frame pointer into exit_frame.
2248	st	%fp, 0(%s5)
2249#endif
2250
2251	// Prepare arguments for the pkfn function (first 8 using s0-s7
2252	// registers, but need to store stack also because of varargs).
2253
2254	stl	%s1, __gtid(%fp)
2255	stl	%s2, __tid(%fp)
2256
	// s0 = &gtid and s1 = &tid, each also written to its stack slot.
2257	adds.l	%s0, __gtid, %fp
2258	st	%s0, 0(, %s35)
2259	adds.l	%s1, __tid, %fp
2260	st	%s1, 8(, %s35)
2261
	// For k = 0..5: leave for the call when argc == k, otherwise load
	// p_argv[k] into s(k+2) and mirror it at 16 + 8*k in the argument area.
2262	breq.l	0, %s37, .L_kmp_call
2263	ld	%s2, 0(, %s38)
2264	st	%s2, 16(, %s35)
2265
2266	breq.l	1, %s37, .L_kmp_call
2267	ld	%s3, 8(, %s38)
2268	st	%s3, 24(, %s35)
2269
2270	breq.l	2, %s37, .L_kmp_call
2271	ld	%s4, 16(, %s38)
2272	st	%s4, 32(, %s35)
2273
2274	breq.l	3, %s37, .L_kmp_call
2275	ld	%s5, 24(, %s38)
2276	st	%s5, 40(, %s35)
2277
2278	breq.l	4, %s37, .L_kmp_call
2279	ld	%s6, 32(, %s38)
2280	st	%s6, 48(, %s35)
2281
2282	breq.l	5, %s37, .L_kmp_call
2283	ld	%s7, 40(, %s38)
2284	st	%s7, 56(, %s35)
2285
2286	breq.l	6, %s37, .L_kmp_call
2287
2288	// Prepare any additional argument passed through the stack.
	// s37 = argc - 6 entries left; s38 -> &p_argv[6]; s35 -> the slot that
	// follows the eight register-mirrored ones.
2289	adds.l	%s37, -6, %s37
2290	lea	%s38, 48(, %s38)
2291	lea	%s35, 64(, %s35)
2292.L_kmp_loop:
2293	ld	%s36, 0(, %s38)
2294	st	%s36, 0(, %s35)
2295	adds.l	%s37, -1, %s37
2296	adds.l	%s38, 8, %s38
2297	adds.l	%s35, 8, %s35
2298	brne.l	0, %s37, .L_kmp_loop
2299
2300.L_kmp_call:
2301	// Call pkfn function.
2302	bsic	%lr, (, %s12)
2303
2304	// Return value.
2305	lea	%s0, 1
2306
2307	// Restore stack and return.
	// %sp is reset to %fp (the entry value of %sp), where the prologue stored
	// the incoming %fp and %lr.
2308	or	%sp, 0, %fp
2309	ld	%lr, 8(, %sp)
2310	ld	%fp, 0(, %sp)
2311	b.l.t	(, %lr)
2312.Lfunc_end0:
2313	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
2314	.cfi_endproc
2315
2316// -- End  __kmp_invoke_microtask
2317
2318#endif /* KMP_ARCH_VE */
2319
#if KMP_ARCH_S390X

//------------------------------------------------------------------------
//
// typedef void (*microtask_t)(int *gtid, int *tid, ...);
//
// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
//                            void *p_argv[]
// #if OMPT_SUPPORT
//                            ,
//                            void **exit_frame_ptr
// #endif
//                            ) {
// #if OMPT_SUPPORT
//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
//   (*pkfn)(&gtid, &tid, p_argv[0], ...);
//
//   return 1;
// }
//
// Parameters (on entry):
//   r2: pkfn
//   r3: gtid
//   r4: tid
//   r5: argc
//   r6: p_argv
//   SP+160: exit_frame_ptr
//
// Locals (in the frame allocated below):
//   __gtid: gtid param stored on the stack so &gtid can be passed to pkfn
//   __tid:  tid param stored on the stack so &tid can be passed to pkfn
//
// Register roles inside the function:
//
//   r0:  scratch used to copy p_argv slots to the outgoing parameter area
//   r7:  number of p_argv entries still to be passed
//   r8:  exit_frame_ptr (OMPT only), then p_argv
//   r9:  pkfn
//   r10: size of the frame being allocated
//   r11: SP on entry; used as frame pointer and CFA base
//   r12: new SP + size of the spill area; __gtid/__tid live at 160/168(r12)
//   r13: byte offset of the current spilled argument
//
// Frame layout after allocation (offsets from the new r15):
//
//   0                       back chain (value of r15 on entry)
//   160 ...                 outgoing stack parameters p_argv[3], p_argv[4], ...
//   160 + spill, 168 + spill
//                           __gtid and __tid, one 64-bit slot each, where
//                           spill = max(0, argc - 2) * 8
//
// return: r2 (always 1/TRUE)
//

// -- Begin __kmp_invoke_microtask
// mark_begin;
	.text
	.globl	__kmp_invoke_microtask
	.p2align	1
	.type	__kmp_invoke_microtask,@function
__kmp_invoke_microtask:
	.cfi_startproc

	// Save r6-r15 at offsets 48..127 from the entry SP. r15 is included so
	// that the slot named by the .cfi_offset for r15 below (CFA-40, i.e.
	// 120 bytes above the entry SP) really holds the entry SP.
	stmg	%r6,%r15,48(%r15)
	.cfi_offset %r6, -112
	.cfi_offset %r7, -104
	.cfi_offset %r8, -96
	.cfi_offset %r9, -88
	.cfi_offset %r10, -80
	.cfi_offset %r11, -72
	.cfi_offset %r12, -64
	.cfi_offset %r13, -56
	.cfi_offset %r14, -48
	.cfi_offset %r15, -40
	lgr	%r11,%r15		// r11 = entry SP (frame pointer)
	.cfi_def_cfa %r11, 160

	// Compute the dynamic frame size:
	//
	// - 160 bytes at the bottom of the frame precede the outgoing
	//   parameter area (stack parameters are written at 160(%r15)).
	// - 16 bytes hold gtid and tid (one 64-bit slot each) so that they can
	//   be passed by reference.
	// - 8 bytes are needed for each argument that cannot be passed to pkfn
	//   in a register. Five registers (r2-r6) are used, two of them are
	//   taken by &gtid and &tid, so p_argv[3] onwards go on the stack. The
	//   code reserves max(0, argc - 2) slots, which is one slot more than
	//   is strictly needed once argc >= 3.
	//
	// The total number of bytes is then max(0, argc - 2)*8 + 176

	lgr	%r10,%r5
	aghi	%r10,-2			// r10 = argc - 2
	jnm	0f			// keep it when not negative
	lghi	%r10,0			// otherwise clamp to 0
0:
	sllg	%r10,%r10,3		// r10 = spill area size in bytes
	lgr	%r12,%r10
	aghi	%r10,176		// r10 = total frame size
	sgr	%r15,%r10		// allocate the frame
	agr	%r12,%r15		// r12 = new SP + spill area size
	stg	%r11,0(%r15)		// back chain = entry SP

	lgr	%r9,%r2			// pkfn

#if OMPT_SUPPORT
	// Save frame pointer into exit_frame
	lg	%r8,160(%r11)		// exit_frame_ptr (stack parameter)
	stg	%r11,0(%r8)		// *exit_frame_ptr = entry SP
#endif

	// Prepare arguments for the pkfn function (first 5 using r2-r6 registers)

	// gtid and tid are stored as full 64-bit registers; pkfn takes int *,
	// so the pointers passed address the low-order word of each slot,
	// which is 4 bytes further up (this relies on big-endian layout).
	stg	%r3,160(%r12)
	la	%r2,164(%r12)		// &gtid
	stg	%r4,168(%r12)
	la	%r3,172(%r12)		// &tid
	lgr	%r8,%r6			// p_argv

	// If argc > 0
	ltgr	%r7,%r5			// r7 = argc, condition code set from it
	jz	1f

	lg	%r4,0(%r8)		// p_argv[0]
	aghi	%r7,-1
	jz	1f

	// If argc > 1
	lg	%r5,8(%r8)		// p_argv[1]
	aghi	%r7,-1
	jz	1f

	// If argc > 2
	lg	%r6,16(%r8)		// p_argv[2]
	aghi	%r7,-1
	jz	1f

	// argc > 3: copy the remaining r7 entries to the outgoing parameter
	// area, starting at 160(%r15).
	lghi	%r13,0			// byte offset of stack parameter n
2:
	lg	%r0,24(%r13,%r8)	// p_argv[3+n]
	stg	%r0,160(%r13,%r15)	// stack parameter n
	aghi	%r13,8			// Next
	aghi	%r7,-1
	jnz	2b

1:
	basr	%r14,%r9		// Call pkfn

	// Restore stack and return

	lgr	%r15,%r11		// drop the frame: SP = entry SP
	lmg	%r6,%r14,48(%r15)	// reload r6-r14 saved by the prologue
	lghi	%r2,1			// return value: always 1
	br	%r14
.Lfunc_end0:
	.size	__kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
	.cfi_endproc

// -- End  __kmp_invoke_microtask

#endif /* KMP_ARCH_S390X */
2472
#if KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
// __kmp_unnamed_critical_addr (32-bit pointer targets): a global 4-byte data
// word initialised with the address of the common symbol
// .gomp_critical_user_.
//
// COMMON is a macro defined outside this excerpt.
// NOTE(review): its operands look like (symbol, size 32, alignment 3) --
// confirm against the macro definition.
    .data
    COMMON .gomp_critical_user_, 32, 3
    .data
    // .p2align gives the same 4-byte alignment on every target, whereas the
    // meaning of a bare .align operand (byte count or power of two) depends
    // on the assembler target.
    .p2align 2
    .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
    .4byte .gomp_critical_user_
#ifdef __ELF__
    .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),4
#endif
#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32 */
2488
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||                   \
    KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||                 \
    KMP_ARCH_S390X
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
// __kmp_unnamed_critical_addr (64-bit pointer targets): a global 8-byte data
// word initialised with the address of the common symbol
// .gomp_critical_user_.
//
// COMMON is a macro defined outside this excerpt.
// NOTE(review): its operands look like (symbol, size 32, alignment 3) --
// confirm against the macro definition.
    .data
    COMMON .gomp_critical_user_, 32, 3
    .data
    // .p2align gives the same 8-byte alignment on every target listed above,
    // whereas a bare ".align 8" is a byte count on some assembler targets
    // and a power of two (256 bytes) on others.
    .p2align 3
    .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
    .8byte .gomp_critical_user_
#ifdef __ELF__
    .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
#endif
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
          KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||
          KMP_ARCH_S390X */
2508
#if KMP_OS_LINUX

// Emit an empty .note.GNU-stack section. The section type is spelled
// %progbits when building for ARM or AArch64 and @progbits for every other
// architecture except WASM, for which no section is emitted.
# if KMP_ARCH_ARM || KMP_ARCH_AARCH64
	.section	.note.GNU-stack,"",%progbits
# elif !KMP_ARCH_WASM
	.section	.note.GNU-stack,"",@progbits
# endif

// GNU_PROPERTY_BTI_PAC is a macro defined outside this excerpt.
// NOTE(review): presumably it emits the .note.gnu.property marker for
// BTI/PAC -- confirm against its definition.
# if KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32
GNU_PROPERTY_BTI_PAC
# endif

#endif /* KMP_OS_LINUX */
2520