xref: /llvm-project/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s (revision 688acb13b1d1eb751c83466935ab12f2b8047930)
1// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
2// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack -filetype=obj < %s > %t
3// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
4// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
5
6// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=6 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
7// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=6 -mattr=+xnack -filetype=obj < %s > %t
8// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
9// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
10
11// READOBJ: Section Headers
12// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
13// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        000100 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
14
15// READOBJ: Relocation section '.rela.rodata' at offset
16// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
17// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
18// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
19// READOBJ: 00000000000000d0 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 310
20
21// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
22// READOBJ:      0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
23// READOBJ-NEXT: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
24// READOBJ-NEXT: 0000000000000200  0 FUNC    LOCAL  PROTECTED 2 special_sgpr
25// READOBJ-NEXT: 0000000000000300  0 FUNC    LOCAL  PROTECTED 2 disabled_user_sgpr
26// READOBJ-NEXT: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
27// READOBJ-NEXT: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
28// READOBJ-NEXT: 0000000000000080 64 OBJECT  LOCAL  DEFAULT   3 special_sgpr.kd
29// READOBJ-NEXT: 00000000000000c0 64 OBJECT  LOCAL  DEFAULT   3 disabled_user_sgpr.kd
30
31// OBJDUMP: Contents of section .rodata
32// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
33// minimal
34// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
35// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
36// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
37// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
38// complete
39// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
40// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
41// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
42// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f080000 00000000
43// special_sgpr
44// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
45// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
46// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
47// OBJDUMP-NEXT: 00b0 00010000 80000000 00000000 00000000
48// disabled_user_sgpr
49// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
50// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
51// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
52// OBJDUMP-NEXT: 00f0 0000ac00 80000000 00000000 00000000
53
54.amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
55// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
56
57.amdhsa_code_object_version 5
58// ASM: .amdhsa_code_object_version 5
59
60.p2align 8
61.type minimal,@function
62minimal:
63  s_endpgm
64
65.p2align 8
66.type complete,@function
67complete:
68  s_endpgm
69
70.p2align 8
71.type special_sgpr,@function
72special_sgpr:
73  s_endpgm
74
75.p2align 8
76.type disabled_user_sgpr,@function
77disabled_user_sgpr:
78  s_endpgm
79
80.rodata
81// ASM: .rodata
82
83// Test that only specifying required directives is allowed, and that defaulted
84// values are omitted.
85.p2align 6
86.amdhsa_kernel minimal
87  .amdhsa_next_free_vgpr 0
88  .amdhsa_next_free_sgpr 0
89.end_amdhsa_kernel
90
91// ASM: .amdhsa_kernel minimal
92// ASM: .amdhsa_next_free_vgpr 0
93// ASM-NEXT: .amdhsa_next_free_sgpr 0
94// ASM: .end_amdhsa_kernel
95
96// Test that we can specify all available directives with non-default values.
97.p2align 6
98.amdhsa_kernel complete
99  .amdhsa_group_segment_fixed_size 1
100  .amdhsa_private_segment_fixed_size 1
101  .amdhsa_kernarg_size 8
102  .amdhsa_user_sgpr_count 15
103  .amdhsa_user_sgpr_private_segment_buffer 1
104  .amdhsa_user_sgpr_dispatch_ptr 1
105  .amdhsa_user_sgpr_queue_ptr 1
106  .amdhsa_user_sgpr_kernarg_segment_ptr 1
107  .amdhsa_user_sgpr_dispatch_id 1
108  .amdhsa_user_sgpr_flat_scratch_init 1
109  .amdhsa_user_sgpr_private_segment_size 1
110  .amdhsa_uses_dynamic_stack 1
111  .amdhsa_system_sgpr_private_segment_wavefront_offset 1
112  .amdhsa_system_sgpr_workgroup_id_x 0
113  .amdhsa_system_sgpr_workgroup_id_y 1
114  .amdhsa_system_sgpr_workgroup_id_z 1
115  .amdhsa_system_sgpr_workgroup_info 1
116  .amdhsa_system_vgpr_workitem_id 1
117  .amdhsa_next_free_vgpr 9
118  .amdhsa_next_free_sgpr 27
119  .amdhsa_reserve_vcc 0
120  .amdhsa_reserve_flat_scratch 0
121  .amdhsa_reserve_xnack_mask 1
122  .amdhsa_float_round_mode_32 1
123  .amdhsa_float_round_mode_16_64 1
124  .amdhsa_float_denorm_mode_32 1
125  .amdhsa_float_denorm_mode_16_64 0
126  .amdhsa_dx10_clamp 0
127  .amdhsa_ieee_mode 0
128  .amdhsa_fp16_overflow 1
129  .amdhsa_exception_fp_ieee_invalid_op 1
130  .amdhsa_exception_fp_denorm_src 1
131  .amdhsa_exception_fp_ieee_div_zero 1
132  .amdhsa_exception_fp_ieee_overflow 1
133  .amdhsa_exception_fp_ieee_underflow 1
134  .amdhsa_exception_fp_ieee_inexact 1
135  .amdhsa_exception_int_div_zero 1
136.end_amdhsa_kernel
137
138// ASM: .amdhsa_kernel complete
139// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
140// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
141// ASM-NEXT: .amdhsa_kernarg_size 8
142// ASM-NEXT: .amdhsa_user_sgpr_count 15
143// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
144// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
145// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
146// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
147// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
148// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
149// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
150// ASM-NEXT: .amdhsa_uses_dynamic_stack 1
151// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
152// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
153// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
154// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
155// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
156// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
157// ASM-NEXT: .amdhsa_next_free_vgpr 9
158// ASM-NEXT: .amdhsa_next_free_sgpr 27
159// ASM-NEXT: .amdhsa_reserve_vcc 0
160// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
161// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
162// ASM-NEXT: .amdhsa_float_round_mode_32 1
163// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
164// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
165// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
166// ASM-NEXT: .amdhsa_dx10_clamp 0
167// ASM-NEXT: .amdhsa_ieee_mode 0
168// ASM-NEXT: .amdhsa_fp16_overflow 1
169// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
170// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
171// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
172// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
173// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
174// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
175// ASM-NEXT: .amdhsa_exception_int_div_zero 1
176// ASM-NEXT: .end_amdhsa_kernel
177
178// Test that we are including special SGPR usage in the granulated count.
179.p2align 6
180.amdhsa_kernel special_sgpr
181  // Same next_free_sgpr as "complete", but...
182  .amdhsa_next_free_sgpr 27
183  // ...on GFX9 this should require an additional 6 SGPRs, pushing us from
184  // 3 granules to 4
185  .amdhsa_reserve_flat_scratch 1
186
187  .amdhsa_reserve_vcc 0
188  .amdhsa_reserve_xnack_mask 1
189
190  .amdhsa_float_denorm_mode_16_64 0
191  .amdhsa_dx10_clamp 0
192  .amdhsa_ieee_mode 0
193  .amdhsa_next_free_vgpr 0
194.end_amdhsa_kernel
195
196// ASM: .amdhsa_kernel special_sgpr
197// ASM: .amdhsa_next_free_vgpr 0
198// ASM-NEXT: .amdhsa_next_free_sgpr 27
199// ASM-NEXT: .amdhsa_reserve_vcc 0
200// ASM-NEXT: .amdhsa_reserve_flat_scratch 1
201// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
202// ASM: .amdhsa_float_denorm_mode_16_64 0
203// ASM-NEXT: .amdhsa_dx10_clamp 0
204// ASM-NEXT: .amdhsa_ieee_mode 0
205// ASM: .end_amdhsa_kernel
206
207// Test that explicitly disabling user_sgpr's does not affect the user_sgpr
208// count, i.e. this should produce the same descriptor as minimal.
209.p2align 6
210.amdhsa_kernel disabled_user_sgpr
211  .amdhsa_user_sgpr_private_segment_buffer 0
212  .amdhsa_next_free_vgpr 0
213  .amdhsa_next_free_sgpr 0
214.end_amdhsa_kernel
215
216// ASM: .amdhsa_kernel disabled_user_sgpr
217// ASM: .amdhsa_next_free_vgpr 0
218// ASM-NEXT: .amdhsa_next_free_sgpr 0
219// ASM: .end_amdhsa_kernel
220
221.section .foo
222
223.byte .amdgcn.gfx_generation_number
224// ASM: .byte 9
225
226.byte .amdgcn.gfx_generation_minor
227// ASM: .byte 0
228
229.byte .amdgcn.gfx_generation_stepping
230// ASM: .byte 4
231
232.byte .amdgcn.next_free_vgpr
233// ASM: .byte 0
234.byte .amdgcn.next_free_sgpr
235// ASM: .byte 0
236
237v_mov_b32_e32 v7, s10
238
239.byte .amdgcn.next_free_vgpr
240// ASM: .byte 8
241.byte .amdgcn.next_free_sgpr
242// ASM: .byte 11
243
244.set .amdgcn.next_free_vgpr, 0
245.set .amdgcn.next_free_sgpr, 0
246
247.byte .amdgcn.next_free_vgpr
248// ASM: .byte 0
249.byte .amdgcn.next_free_sgpr
250// ASM: .byte 0
251
252v_mov_b32_e32 v16, s3
253
254.byte .amdgcn.next_free_vgpr
255// ASM: .byte 17
256.byte .amdgcn.next_free_sgpr
257// ASM: .byte 4
258
259// Metadata
260
261.amdgpu_metadata
262  amdhsa.version:
263    - 3
264    - 0
265  amdhsa.kernels:
266    - .name:       amd_kernel_code_t_test_all
267      .symbol: amd_kernel_code_t_test_all@kd
268      .kernarg_segment_size: 8
269      .group_segment_fixed_size: 16
270      .private_segment_fixed_size: 32
271      .uses_dynamic_stack: true
272      .kernarg_segment_align: 64
273      .wavefront_size: 128
274      .sgpr_count: 14
275      .vgpr_count: 40
276      .max_flat_workgroup_size: 256
277    - .name:       amd_kernel_code_t_minimal
278      .symbol: amd_kernel_code_t_minimal@kd
279      .kernarg_segment_size: 8
280      .group_segment_fixed_size: 16
281      .private_segment_fixed_size: 32
282      .uses_dynamic_stack: true
283      .kernarg_segment_align: 64
284      .wavefront_size: 128
285      .sgpr_count: 14
286      .vgpr_count: 40
287      .max_flat_workgroup_size: 256
288.end_amdgpu_metadata
289
290// ASM:      	.amdgpu_metadata
291// ASM:      amdhsa.kernels:
292// ASM:        - .group_segment_fixed_size: 16
293// ASM:          .kernarg_segment_align: 64
294// ASM:          .kernarg_segment_size: 8
295// ASM:          .max_flat_workgroup_size: 256
296// ASM:          .name:           amd_kernel_code_t_test_all
297// ASM:          .private_segment_fixed_size: 32
298// ASM:          .sgpr_count:     14
299// ASM:          .symbol:         'amd_kernel_code_t_test_all@kd'
300// ASM:          .uses_dynamic_stack: true
301// ASM:          .vgpr_count:     40
302// ASM:          .wavefront_size: 128
303// ASM:        - .group_segment_fixed_size: 16
304// ASM:          .kernarg_segment_align: 64
305// ASM:          .kernarg_segment_size: 8
306// ASM:          .max_flat_workgroup_size: 256
307// ASM:          .name:           amd_kernel_code_t_minimal
308// ASM:          .private_segment_fixed_size: 32
309// ASM:          .sgpr_count:     14
310// ASM:          .symbol:         'amd_kernel_code_t_minimal@kd'
311// ASM:          .uses_dynamic_stack: true
312// ASM:          .vgpr_count:     40
313// ASM:          .wavefront_size: 128
314// ASM:      amdhsa.version:
315// ASM-NEXT:   - 3
316// ASM-NEXT:   - 0
317// ASM:      	.end_amdgpu_metadata
318