xref: /llvm-project/llvm/test/CodeGen/AMDGPU/inline-asm.ll (revision 9843843c88f6cd8fa68c301ba751c001c254cb63)
1; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck  --check-prefix=CHECK %s
3
4; CHECK-LABEL: {{^}}inline_asm:
5; CHECK: s_endpgm
6; CHECK: s_endpgm
; Kernel: stores the constant 5 through %out, then issues a side-effecting
; inline asm whose text is "s_endpgm" (no operands, empty constraint string)
; before the normal return. The directives above expect s_endpgm twice in the
; output -- presumably once from the asm text and once from the kernel's own
; return; confirm against llc output.
7define amdgpu_kernel void @inline_asm(ptr addrspace(1) %out) {
8entry:
9  store i32 5, ptr addrspace(1) %out
10  call void asm sideeffect "s_endpgm", ""()
11  ret void
12}
13
14; CHECK-LABEL: {{^}}inline_asm_shader:
15; CHECK: s_endpgm
16; CHECK: s_endpgm
; amdgpu_ps function with no parameters whose body is only a side-effecting
; inline asm "s_endpgm" followed by ret. The directives above expect s_endpgm
; twice -- presumably asm text plus the function's own return; confirm.
17define amdgpu_ps void @inline_asm_shader() {
18entry:
19  call void asm sideeffect "s_endpgm", ""()
20  ret void
21}
22
23
24; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
25; Make sure VGPR inline assembly is treated as divergent.
26; CHECK: v_mov_b32 v{{[0-9]+}}, 0
27; CHECK: v_cmp_eq_u32
28; CHECK: s_and_saveexec_b64
; %zero comes out of an inline asm through the "=v" output constraint (the asm
; text moves 0 into $0 with v_mov_b32; the call is not marked sideeffect).
; The value is compared with 0 and the i1 result drives a conditional branch:
; %if stores 0 through %out, %endif just returns.
29define amdgpu_kernel void @branch_on_asm_vgpr(ptr addrspace(1) %out) {
30	%zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
31	%cmp = icmp eq i32 %zero, 0
32	br i1 %cmp, label %if, label %endif
33
34if:
35	store i32 0, ptr addrspace(1) %out
36	br label %endif
37
38endif:
39  ret void
40}
41
42; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
43; Make sure SGPR inline assembly is treated as uniform.
44; CHECK: s_mov_b32 s{{[0-9]+}}, 0
45; CHECK: s_cmp_lg_u32
46; CHECK: s_cbranch_scc0
; %zero comes out of an inline asm through the "=s" output constraint (the asm
; text moves 0 into $0 with s_mov_b32; the call is not marked sideeffect).
; The value is compared with 0 and the i1 result drives a conditional branch:
; %if stores 0 through %out, %endif just returns.
47define amdgpu_kernel void @branch_on_asm_sgpr(ptr addrspace(1) %out) {
48	%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
49	%cmp = icmp eq i32 %zero, 0
50	br i1 %cmp, label %if, label %endif
51
52if:
53	store i32 0, ptr addrspace(1) %out
54	br label %endif
55
56endif:
57  ret void
58}
59
60; CHECK-LABEL: {{^}}v_cmp_asm:
61; CHECK: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
62; CHECK: v_cmp_ne_u32_e64 s[[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]], 0, [[SRC]]
63; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
64; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
65; CHECK: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]]
; The asm writes an i64 result through an "=s" output ($0) and reads the i32
; kernel argument %in through a "v" input ($1); the i64 is then stored through
; %out. Per the directives above, the input is expected to be copied into a
; VGPR first and the 64-bit SGPR-pair result copied into two VGPRs for the
; dwordx2 store.
66define amdgpu_kernel void @v_cmp_asm(ptr addrspace(1) %out, i32 %in) {
67  %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
68  store i64 %sgpr, ptr addrspace(1) %out
69  ret void
70}
71
72; CHECK-LABEL: {{^}}code_size_inline_asm:
73; CHECK: codeLenInByte = 12
; Body is one side-effecting inline asm containing a single v_nop_e64, then
; ret; %out is never used. The directive above pins the reported
; codeLenInByte at 12.
74define amdgpu_kernel void @code_size_inline_asm(ptr addrspace(1) %out) {
75entry:
76  call void asm sideeffect "v_nop_e64", ""()
77  ret void
78}
79
80; All inlineasm instructions are assumed to be the maximum size
81; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
82; CHECK: codeLenInByte = 12
; Body is one side-effecting inline asm containing a single v_nop_e32, then
; ret; %out is never used. The expected codeLenInByte is still 12, the same
; value the file expects for a single v_nop_e64.
83define amdgpu_kernel void @code_size_inline_asm_small_inst(ptr addrspace(1) %out) {
84entry:
85  call void asm sideeffect "v_nop_e32", ""()
86  ret void
87}
88
89; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
90; CHECK: codeLenInByte = 20
; The asm string spans several source lines: it opens with a newline, holds
; two v_nop_e64 statements on separate lines, and closes with a line of
; spaces only. The directive above expects codeLenInByte = 20.
91define amdgpu_kernel void @code_size_inline_asm_2_inst(ptr addrspace(1) %out) {
92entry:
93  call void asm sideeffect "
94    v_nop_e64
95    v_nop_e64
96   ", ""()
97  ret void
98}
99
100; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
101; CHECK: codeLenInByte = 20
; Two v_nop_e64 statements separated by an empty line inside the asm string.
; The expected codeLenInByte is 20, i.e. the blank line is not expected to add
; to the reported size.
102define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(ptr addrspace(1) %out) {
103entry:
104  call void asm sideeffect "
105    v_nop_e64
106
107    v_nop_e64
108   ", ""()
109  ret void
110}
111
112; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
113; CHECK: codeLenInByte = 4
; Side-effecting inline asm with an empty string, followed by ret. The
; directive above expects codeLenInByte = 4, the smallest value expected
; anywhere in this file's code-size cases.
114define amdgpu_kernel void @code_size_inline_asm_0_inst(ptr addrspace(1) %out) {
115entry:
116  call void asm sideeffect "", ""()
117  ret void
118}
119
120; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
121; CHECK: codeLenInByte = 4
; The asm string holds only an assembler comment ("; comment"). The expected
; codeLenInByte is 4, the same value the file expects for an empty asm string.
122define amdgpu_kernel void @code_size_inline_asm_1_comment(ptr addrspace(1) %out) {
123entry:
124  call void asm sideeffect "; comment", ""()
125  ret void
126}
127
128; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
129; CHECK: codeLenInByte = 4
; The asm string is a newline followed by a single assembler comment. The
; expected codeLenInByte is 4, so neither the leading newline nor the comment
; is expected to count toward the size.
130define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(ptr addrspace(1) %out) {
131entry:
132  call void asm sideeffect "
133; comment", ""()
134  ret void
135}
136
137; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
138; CHECK: codeLenInByte = 4
; The asm string is a single assembler comment followed by a newline. The
; expected codeLenInByte is 4, so the trailing newline is not expected to
; count toward the size.
139define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(ptr addrspace(1) %out) {
140entry:
141  call void asm sideeffect "; comment
142", ""()
143  ret void
144}
145
146; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
147; CHECK: codeLenInByte = 4
; One asm line carrying two ';' markers separated by spaces
; ("; first comment ; second comment"). The expected codeLenInByte is 4, i.e.
; the text after the second ';' must not be sized as an instruction.
148define amdgpu_kernel void @code_size_inline_asm_2_comments_line(ptr addrspace(1) %out) {
149entry:
150  call void asm sideeffect "; first comment ; second comment", ""()
151  ret void
152}
153
154; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
155; CHECK: codeLenInByte = 4
; One asm line carrying two ';' markers with no space around the second one
; ("; first comment;second comment"). The expected codeLenInByte is 4.
156define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(ptr addrspace(1) %out) {
157entry:
158  call void asm sideeffect "; first comment;second comment", ""()
159  ret void
160}
161
162; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
163; CHECK: codeLenInByte = 20
; Multi-line asm string that starts with a comment line and mixes two
; v_nop_e64 statements with an inline comment, a stand-alone comment, an empty
; line and two trailing comment lines. The expected codeLenInByte is 20, the
; same value the file expects for two bare v_nop_e64 statements.
164define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(ptr addrspace(1) %out) {
165entry:
166  call void asm sideeffect "; comment
167    v_nop_e64 ; inline comment
168; separate comment
169    v_nop_e64
170
171    ; trailing comment
172    ; extra comment
173  ", ""()
174  ret void
175}
176
177; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
178; CHECK: codeLenInByte = 20
; Multi-line asm string whose first line is an instruction followed by an
; inline comment (no leading comment line), then a stand-alone comment, a
; second v_nop_e64, an empty line and two trailing comment lines. The
; expected codeLenInByte is 20.
179define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(ptr addrspace(1) %out) {
180entry:
181  call void asm sideeffect "v_nop_e64 ; inline comment
182; separate comment
183    v_nop_e64
184
185    ; trailing comment
186    ; extra comment
187  ", ""()
188  ret void
189}
190
191; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
192; CHECK: codeLenInByte = 20
; Multi-line asm string mixing comments with two instructions that take
; operands (v_add_i32_e32 and v_bfrev_b32_e32, both written with explicit
; physical registers). The expected codeLenInByte is 20, the same value the
; file expects for two operand-less v_nop_e64 statements.
193define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(ptr addrspace(1) %out) {
194entry:
195  call void asm sideeffect "; comment
196    v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
197; separate comment
198    v_bfrev_b32_e32 v0, 1
199
200    ; trailing comment
201    ; extra comment
202  ", ""()
203  ret void
204}
205
206; FIXME: Should not have intermediate sgprs
207; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
208; CHECK: v_mov_b32_e32 v0, 0x1e240
209; CHECK: v_mov_b32_e32 v1, 0
210; CHECK: use v[0:1]
; Passes the i64 constant 123456 (0x1e240) to a comment-only asm through the
; physical register-pair constraint {v[0:1]}. The directives above expect the
; low half materialized in v0 and 0 in v1 before the "use" line.
211define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
212entry:
213  call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)
214  ret void
215}
216
217; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
218; CHECK: v_mov_b32_e32 v0, 1{{$}}
219; CHECK: ; use v0
; Passes the i1 constant true to a comment-only asm through the physical
; register constraint {v0}. The directives above expect v0 to be set to
; exactly 1 before the "use" line.
220define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
221entry:
222  call void asm sideeffect "; use $0 ", "{v0}"(i1 true)
223  ret void
224}
225
226
227; FIXME: This behavior is nonsense. We should probably disallow i1 asm
228
229; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
230; CHECK: {{buffer|flat}}_load_ubyte [[LOAD:v[0-9]+]]
231; CHECK-NOT: [[LOAD]]
232; CHECK: ; use v0
233; CHECK: v_and_b32_e32 [[STORE:v[0-9]+]], 1, v1
234; CHECK: {{buffer|flat}}_store_byte [[STORE]],
; Loads an i1 from an undef addrspace(1) pointer, feeds it to the asm in {v0}
; ($1), takes an i1 result back from {v1} ($0) and stores that result to an
; undef addrspace(1) pointer. Note the constraint string contains a space
; after the comma ("={v1}, {v0}").
235define amdgpu_kernel void @i1_input_phys_vgpr() {
236entry:
237  %val = load i1, ptr addrspace(1) undef
238  %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
239  store i1 %cc, ptr addrspace(1) undef
240  ret void
241}
242
243; FIXME: Should probably be masking high bits of load.
244; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
245; CHECK: buffer_load_ubyte v0
246; CHECK-NEXT: s_waitcnt
247; CHECK-NEXT: buffer_load_ubyte v1
248; CHECK-NEXT: s_waitcnt
249; CHECK-NEXT: ASMSTART
; Performs two volatile i1 loads from an undef addrspace(1) pointer and passes
; them to a comment-only asm in {v0} and {v1} respectively. The directives
; above expect each load to target its physical register directly, with only
; s_waitcnt between the loads and the start of the asm block.
250define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
251entry:
252  %val0 = load volatile i1, ptr addrspace(1) undef
253  %val1 = load volatile i1, ptr addrspace(1) undef
254  call void asm sideeffect "; use $0 $1 ", "{v0}, {v1}"(i1 %val0, i1 %val1)
255  ret void
256}
257
258; CHECK-LABEL: {{^}}muliple_def_phys_vgpr:
259; CHECK: ; def v0
260; CHECK: v_mov_b32_e32 v1, v0
261; CHECK: ; def v0
262; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
; Two separate asm calls each define an i32 in the same physical register
; {v0}. Both results are live at the shift, so the first must be preserved
; across the second asm (the directives above expect a copy of v0 into v1).
; %add is a misnomer: the operation is shl of %def0 by %def1. The result is
; stored to an undef addrspace(1) pointer.
263define amdgpu_kernel void @muliple_def_phys_vgpr() {
264entry:
265  %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()
266  %def1 = call i32 asm sideeffect "; def $0 ", "={v0}"()
267  %add = shl i32 %def0, %def1
268  store i32 %add, ptr addrspace(1) undef
269  ret void
270}
271
272; CHECK-LABEL: {{^}}asm_constraint_c_n:
273; CHECK: s_trap 10{{$}}
; Exercises the ${0:c} operand modifier with an "n" (immediate) constraint on
; the constant 10; the directive above expects the operand to print as a bare
; "10". The call site references attribute group #1, whose definition is not
; in the visible part of this file.
274define amdgpu_kernel void @asm_constraint_c_n()  {
275entry:
276  tail call void asm sideeffect "s_trap ${0:c}", "n"(i32 10) #1
277  ret void
278}
279
280; CHECK-LABEL: {{^}}asm_constraint_n_n:
281; CHECK: s_trap -10{{$}}
; Exercises the ${0:n} operand modifier with an "n" (immediate) constraint on
; the constant 10; the directive above expects the operand to print negated,
; as "-10". The call site references attribute group #1, whose definition is
; not in the visible part of this file.
282define amdgpu_kernel void @asm_constraint_n_n()  {
283entry:
284  tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1
285  ret void
286}
287
288; Make sure tuples of 3 SGPRs are printed with the [] syntax instead
289; of the tablegen default.
290; CHECK-LABEL: {{^}}sgpr96_name_format:
291; CHECK: ; sgpr96 s[0:2]
; Passes the constant vector <3 x i32> <10, 11, 12> to a comment-only asm
; through an "s" constraint so that the printed operand name can be inspected;
; the directive above expects the range form s[0:2].
292define amdgpu_kernel void @sgpr96_name_format()  {
293entry:
294  tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
295  ret void
296}
297
298; Check aggregate types are handled properly.
299; CHECK-LABEL: {{^}}mad_u64:
300; CHECK: v_mad_u64_u32
; Non-kernel function whose asm returns an aggregate. On the %false path the
; asm yields { i64, i64 } through the outputs "=v" ($0) and "=s" ($1) and
; takes three "v" inputs ($2-$4): the constant -766435501, %x and i64 0.
; In %exit a phi merges that aggregate with undef from %entry; both fields are
; extracted and each is handed to its own comment-only asm through a "v"
; constraint.
301define void @mad_u64(i32 %x, i1 %c0) {
302entry:
303  br i1 %c0, label %exit, label %false
304
305false:
306  %s0 = tail call { i64, i64 } asm sideeffect "v_mad_u64_u32 $0, $1, $2, $3, $4", "=v,=s,v,v,v"(i32 -766435501, i32 %x, i64 0)
307  br label %exit
308
309exit:
310  %s1 = phi { i64, i64} [ undef, %entry ], [ %s0, %false]
311  %v0 = extractvalue { i64, i64 } %s1, 0
312  %v1 = extractvalue { i64, i64 } %s1, 1
313  tail call void asm sideeffect "; use $0", "v"(i64 %v0)
314  tail call void asm sideeffect "; use $0", "v"(i64 %v1)
315  ret void
316}
317
318; CHECK-LABEL: {{^}}scc_as_i32:
319; CHECK: ; def scc
320; CHECK: ; use scc
; Defines an i32 through the physical-register output constraint {scc} in one
; asm and passes it straight back through a {scc} input constraint in a
; second asm. Both asm strings are comments that only print the operand.
321define void @scc_as_i32() {
322  %scc = call i32 asm sideeffect "; def $0", "={scc}"()
323  call void asm sideeffect "; use $0 ", "{scc}"(i32 %scc)
324  ret void
325}
326
327; CHECK-LABEL: {{^}}scc_as_i1:
328; CHECK: ; def scc
329; CHECK: ; use scc
; Defines an i1 through the physical-register output constraint {scc} in one
; asm and passes it straight back through a {scc} input constraint in a
; second asm. Both asm strings are comments that only print the operand.
330define void @scc_as_i1() {
331  %scc = call i1 asm sideeffect "; def $0", "={scc}"()
332  call void asm sideeffect "; use $0 ", "{scc}"(i1 %scc)
333  ret void
334}
335
336; Make sure the SGPR def is treated as a uniform value when the inline
337; assembly also defines a divergent value. The add should be scalar
338; and not introduce illegal vgpr to sgpr copies.
339; CHECK-LABEL: {{^}}mixed_def_vgpr_sgpr_def_asm:
340; CHECK: ; def v0 s[4:5]
341; CHECK: s_add_u32
342; CHECK-NEXT: s_addc_u32
343; CHECK: ; use s[4:5]
; One asm returns { i32, i64 }: field 0 through "=v" and field 1 through the
; physical pair {s[4:5]}. Only the i64 field is used afterwards: 2 is added to
; it and the sum is passed to a second asm in {s[4:5]}. %vgpr is extracted
; but has no users.
344define void @mixed_def_vgpr_sgpr_def_asm() {
345  %vgpr_sgpr = call { i32, i64 } asm sideeffect "; def $0 $1 ", "=v,={s[4:5]}"()
346  %vgpr = extractvalue { i32, i64 } %vgpr_sgpr, 0
347  %sgpr = extractvalue { i32, i64 } %vgpr_sgpr, 1
348  %sgpr.add = add i64 %sgpr, 2
349  call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
350  ret void
351}
352
353; CHECK-LABEL: {{^}}mixed_def_sgpr_vgpr_def_asm:
354; CHECK: ; def s[4:5] v0
355; CHECK: s_add_u32
356; CHECK-NEXT: s_addc_u32
357; CHECK: ; use s[4:5]
; One asm returns { i64, i32 }: field 0 through the physical pair {s[4:5]} and
; field 1 through "=v" (SGPR output listed first). Only the i64 field is used
; afterwards: 2 is added to it and the sum is passed to a second asm in
; {s[4:5]}. %vgpr is extracted but has no users. The directives above expect
; the add to be done with s_add_u32 / s_addc_u32.
358define void @mixed_def_sgpr_vgpr_def_asm() {
359  %sgpr_vgpr = call { i64, i32 } asm sideeffect "; def $0 $1 ", "={s[4:5]},=v"()
360  %sgpr = extractvalue { i64, i32 } %sgpr_vgpr, 0
361  %vgpr = extractvalue { i64, i32 } %sgpr_vgpr, 1
362  %sgpr.add = add i64 %sgpr, 2
363  call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
364  ret void
365}
366