; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI -check-prefix=CI-NOHSA %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0

; In this test both the pointer and the offset operands to the
; BUFFER_LOAD instructions end up in VGPRs. This requires us to
; add the pointer and offset together, store the result in the
; offset operand (vaddr), and then store 0 in an SGPR pair and
; use that for the pointer operand (the low 64 bits of srsrc).
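; (With addr64 set, the hardware adds the 64-bit address in vaddr to the
; base in srsrc, which is why an all-zero SGPR pair works as the resource
; base here.)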

; GCN-LABEL: {{^}}mubuf:

; Make sure we aren't using VGPRs for the source operand of s_mov_b64
; GCN-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v

; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
; instructions
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

define amdgpu_kernel void @mubuf(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
  %tmp2 = sext i32 %tmp to i64
  %tmp3 = sext i32 %tmp1 to i64
  br label %loop

loop:                                             ; preds = %loop, %entry
  %tmp4 = phi i64 [ 0, %entry ], [ %tmp5, %loop ]
  %tmp5 = add i64 %tmp2, %tmp4
  %tmp6 = getelementptr i8, ptr addrspace(1) %in, i64 %tmp5
  %tmp7 = load i8, ptr addrspace(1) %tmp6, align 1
  %tmp8 = or i64 %tmp5, 1
  %tmp9 = getelementptr i8, ptr addrspace(1) %in, i64 %tmp8
  %tmp10 = load i8, ptr addrspace(1) %tmp9, align 1
  %tmp11 = add i8 %tmp7, %tmp10
  %tmp12 = sext i8 %tmp11 to i32
  store i32 %tmp12, ptr addrspace(1) %out
  %tmp13 = icmp slt i64 %tmp5, 10
  br i1 %tmp13, label %loop, label %done

done:                                             ; preds = %loop
  ret void
}

; Test moving an SMRD instruction to the VALU
; FIXME: movs can be moved before nop to reduce count
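; The access is at dword index 3000: 3000 * 4 = 12000 = 0x2ee0 bytes. SI
; materializes the byte offset in an SGPR; CI's wider SMRD literal offset is
; counted in dwords, hence the 0xbb8 (= 3000) below.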

; GCN-LABEL: {{^}}smrd_valu:
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
; SI-DAG: s_mov_b32
; SI-DAG: s_load_dword [[OUT:s[0-9]+]], s[[[PTR_LO]]:[[PTR_HI]]], [[OFFSET]]

; CI: s_load_dword [[OUT:s[0-9]+]], s[[[PTR_LO]]:[[PTR_HI]]], 0xbb8
; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
; GCN-NOHSA: buffer_store_dword [[V_OUT]]
; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
define amdgpu_kernel void @smrd_valu(ptr addrspace(1) %in, i32 %a, i32 %b, ptr addrspace(1) %out) #1 {
entry:
  %tmp = icmp ne i32 %a, 0
  br i1 %tmp, label %if, label %else

if:                                               ; preds = %entry
  %tmp1 = load ptr addrspace(4), ptr addrspace(1) %in
  br label %endif

else:                                             ; preds = %entry
  %tmp2 = getelementptr ptr addrspace(4), ptr addrspace(1) %in
  %tmp3 = load ptr addrspace(4), ptr addrspace(1) %tmp2
  br label %endif

endif:                                            ; preds = %else, %if
  %tmp4 = phi ptr addrspace(4) [ %tmp1, %if ], [ %tmp3, %else ]
  %tmp5 = getelementptr i32, ptr addrspace(4) %tmp4, i32 3000
  %tmp6 = load i32, ptr addrspace(4) %tmp5
  store i32 %tmp6, ptr addrspace(1) %out
  ret void
}

; Test moving an SMRD with an immediate offset to the VALU
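; The address is 32 * workitem.id.x() + 16 (element 4 of an [8 x i32] row),
; so the constant 16 bytes folds into the MUBUF immediate offset below.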

; GCN-LABEL: {{^}}smrd_valu2:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu2(ptr addrspace(1) %out, ptr addrspace(4) %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [8 x i32], ptr addrspace(4) %in, i32 %tmp, i32 4
  %tmp3 = load i32, ptr addrspace(4) %tmp2
  store i32 %tmp3, ptr addrspace(1) %out
  ret void
}

; Use a big offset that will use the SMRD literal offset on CI
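; 5000 i32 elements = 20000 bytes = 0x4e20, too big for an immediate offset,
; so it is materialized in an SGPR and used as the MUBUF soffset.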
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
define amdgpu_kernel void @smrd_valu_ci_offset(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr i32, ptr addrspace(4) %in, i32 %tmp
  %tmp3 = getelementptr i32, ptr addrspace(4) %tmp2, i32 5000
  %tmp4 = load i32, ptr addrspace(4) %tmp3
  %tmp5 = add i32 %tmp4, %c
  store i32 %tmp5, ptr addrspace(1) %out
  ret void
}

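; Same pattern for i64: 5000 * 8 bytes = 40000 = 0x9c40.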
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx2
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu_ci_offset_x2(ptr addrspace(1) %out, ptr addrspace(4) %in, i64 %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr i64, ptr addrspace(4) %in, i32 %tmp
  %tmp3 = getelementptr i64, ptr addrspace(4) %tmp2, i32 5000
  %tmp4 = load i64, ptr addrspace(4) %tmp3
  %tmp5 = or i64 %tmp4, %c
  store i64 %tmp5, ptr addrspace(1) %out
  ret void
}

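; For <4 x i32>: 1234 * 16 bytes = 19744 = 0x4d20.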
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu_ci_offset_x4(ptr addrspace(1) %out, ptr addrspace(4) %in, <4 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <4 x i32>, ptr addrspace(4) %in, i32 %tmp
  %tmp3 = getelementptr <4 x i32>, ptr addrspace(4) %tmp2, i32 1234
  %tmp4 = load <4 x i32>, ptr addrspace(4) %tmp3
  %tmp5 = or <4 x i32> %tmp4, %c
  store <4 x i32> %tmp5, ptr addrspace(1) %out
  ret void
}

; Original scalar load uses SGPR offset on SI and 32-bit literal on CI.
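; For <8 x i32>: 1234 * 32 bytes = 39488 = 0x9a40; the two dwordx4 halves of
; the load sit 16 bytes apart (0x9a50 = 0x9a40 + 16).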

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; CI-NOHSA-NOT: v_add
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}

; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @smrd_valu_ci_offset_x8(ptr addrspace(1) %out, ptr addrspace(4) %in, <8 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <8 x i32>, ptr addrspace(4) %in, i32 %tmp
  %tmp3 = getelementptr <8 x i32>, ptr addrspace(4) %tmp2, i32 1234
  %tmp4 = load <8 x i32>, ptr addrspace(4) %tmp3
  %tmp5 = or <8 x i32> %tmp4, %c
  store <8 x i32> %tmp5, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:

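; For <16 x i32>: 1234 * 64 bytes = 78976 = 0x13480; the four dwordx4 pieces
; land at 16-byte steps: 0x13480, 0x13490, 0x134a0, 0x134b0.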
; SI-DAG: s_mov_b64 s[{{[0-9:]+}}], 0x13480
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}

; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; GCN: s_endpgm
define amdgpu_kernel void @smrd_valu_ci_offset_x16(ptr addrspace(1) %out, ptr addrspace(4) %in, <16 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <16 x i32>, ptr addrspace(4) %in, i32 %tmp
  %tmp3 = getelementptr <16 x i32>, ptr addrspace(4) %tmp2, i32 1234
  %tmp4 = load <16 x i32>, ptr addrspace(4) %tmp3
  %tmp5 = or <16 x i32> %tmp4, %c
  store <16 x i32> %tmp5, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu2_salu_user:
; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
; GCN-NOHSA: buffer_store_dword [[ADD]]
; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
define amdgpu_kernel void @smrd_valu2_salu_user(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %a) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [8 x i32], ptr addrspace(4) %in, i32 %tmp, i32 4
  %tmp3 = load i32, ptr addrspace(4) %tmp2
  %tmp4 = add i32 %tmp3, %a
  store i32 %tmp4, ptr addrspace(1) %out
  ret void
}

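; Element index 255: 255 * 4 = 1020 bytes, the largest offset that still fits
; the SMRD 8-bit dword immediate (255 dwords) on SI.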
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu2_max_smrd_offset(ptr addrspace(1) %out, ptr addrspace(4) %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [1024 x i32], ptr addrspace(4) %in, i32 %tmp, i32 255
  %tmp3 = load i32, ptr addrspace(4) %tmp2
  store i32 %tmp3, ptr addrspace(1) %out
  ret void
}

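; Element index 256: 256 * 4 = 1024 bytes, one dword past the SMRD immediate
; limit, so the offset has to go in the MUBUF offset field instead.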
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu2_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(4) %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [1024 x i32], ptr addrspace(4) %in, i32 %tmp, i32 256
  %tmp3 = load i32, ptr addrspace(4) %tmp2
  store i32 %tmp3, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v8i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
  %tmp3 = load <8 x i32>, ptr addrspace(4) %tmp1, align 4
  store <8 x i32> %tmp3, ptr addrspace(1) %out, align 32
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v8i32_salu_user(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
  %tmp3 = load <8 x i32>, ptr addrspace(4) %tmp1, align 4

  %elt0 = extractelement <8 x i32> %tmp3, i32 0
  %elt1 = extractelement <8 x i32> %tmp3, i32 1
  %elt2 = extractelement <8 x i32> %tmp3, i32 2
  %elt3 = extractelement <8 x i32> %tmp3, i32 3
  %elt4 = extractelement <8 x i32> %tmp3, i32 4
  %elt5 = extractelement <8 x i32> %tmp3, i32 5
  %elt6 = extractelement <8 x i32> %tmp3, i32 6
  %elt7 = extractelement <8 x i32> %tmp3, i32 7

  %add0 = add i32 %elt0, %elt1
  %add1 = add i32 %add0, %elt2
  %add2 = add i32 %add1, %elt3
  %add3 = add i32 %add2, %elt4
  %add4 = add i32 %add3, %elt5
  %add5 = add i32 %add4, %elt6
  %add6 = add i32 %add5, %elt7

  store i32 %add6, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v16i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
  %tmp3 = load <16 x i32>, ptr addrspace(4) %tmp1, align 4
  store <16 x i32> %tmp3, ptr addrspace(1) %out, align 32
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v16i32_salu_user(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
  %tmp3 = load <16 x i32>, ptr addrspace(4) %tmp1, align 4

  %elt0 = extractelement <16 x i32> %tmp3, i32 0
  %elt1 = extractelement <16 x i32> %tmp3, i32 1
  %elt2 = extractelement <16 x i32> %tmp3, i32 2
  %elt3 = extractelement <16 x i32> %tmp3, i32 3
  %elt4 = extractelement <16 x i32> %tmp3, i32 4
  %elt5 = extractelement <16 x i32> %tmp3, i32 5
  %elt6 = extractelement <16 x i32> %tmp3, i32 6
  %elt7 = extractelement <16 x i32> %tmp3, i32 7
  %elt8 = extractelement <16 x i32> %tmp3, i32 8
  %elt9 = extractelement <16 x i32> %tmp3, i32 9
  %elt10 = extractelement <16 x i32> %tmp3, i32 10
  %elt11 = extractelement <16 x i32> %tmp3, i32 11
  %elt12 = extractelement <16 x i32> %tmp3, i32 12
  %elt13 = extractelement <16 x i32> %tmp3, i32 13
  %elt14 = extractelement <16 x i32> %tmp3, i32 14
  %elt15 = extractelement <16 x i32> %tmp3, i32 15

  %add0 = add i32 %elt0, %elt1
  %add1 = add i32 %add0, %elt2
  %add2 = add i32 %add1, %elt3
  %add3 = add i32 %add2, %elt4
  %add4 = add i32 %add3, %elt5
  %add5 = add i32 %add4, %elt6
  %add6 = add i32 %add5, %elt7
  %add7 = add i32 %add6, %elt8
  %add8 = add i32 %add7, %elt9
  %add9 = add i32 %add8, %elt10
  %add10 = add i32 %add9, %elt11
  %add11 = add i32 %add10, %elt12
  %add12 = add i32 %add11, %elt13
  %add13 = add i32 %add12, %elt14
  %add14 = add i32 %add13, %elt15

  store i32 %add14, ptr addrspace(1) %out
  ret void
}

; Make sure we legalize VOPC operands after moving an SOPC to the VALU.

; GCN-LABEL: {{^}}sopc_vopc_legalize_bug:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
; GCN: s_cbranch_vccnz [[EXIT:.L[A-Z0-9_]+]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-NOHSA: buffer_store_dword [[ONE]]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm
define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, ptr addrspace(1) %out, ptr addrspace(1) %in) {
bb3:
  %tmp0 = bitcast i32 %cond to float
  %tmp1 = fadd float %tmp0, 2.500000e-01
  %tmp2 = bitcast float %tmp1 to i32
  %tmp3 = icmp ult i32 %tmp2, %cond
  br i1 %tmp3, label %bb6, label %bb7

bb6:
  store i32 1, ptr addrspace(1) %out
  br label %bb7

bb7:                                              ; preds = %bb6, %bb3
  ret void
}

; GCN-LABEL: {{^}}phi_visit_order:
; GCN: v_add_i32_e64 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 1, v{{[0-9]+}}
define amdgpu_kernel void @phi_visit_order() {
bb:
  br label %bb1

bb1:
  %tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb4 ]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cnd = icmp eq i32 %tid, 0
  br i1 %cnd, label %bb4, label %bb2

bb2:
  %tmp3 = add nsw i32 %tmp, 1
  br label %bb4

bb4:
  %tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp, %bb1 ]
  store volatile i32 %tmp5, ptr addrspace(1) undef
  br label %bb1
}

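; The phi'd immediate is 1024 = 0x400; the xor just toggles the offset
; between 1024 and 0, and both it and the immediate should stay on the SALU.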
; GCN-LABEL: {{^}}phi_imm_in_sgprs:
; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400
; GCN: [[LOOP_LABEL:.L[0-9a-zA-Z_]+]]:
; GCN: s_xor_b32 [[B]], [[B]], 0x400
; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
define amdgpu_kernel void @phi_imm_in_sgprs(ptr addrspace(3) %out, i32 %cond) {
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.add, %loop]
  %offset = phi i32 [1024, %entry], [%offset.xor, %loop]
  %offset.xor = xor i32 %offset, 1024
  %offset.i = add i32 %offset.xor, %i
  %ptr = getelementptr i32, ptr addrspace(3) %out, i32 %offset.i
  store i32 0, ptr addrspace(3) %ptr
  %i.add = add i32 %i, 1
  %cmp = icmp ult i32 %i.add, %cond
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }