xref: /llvm-project/llvm/test/CodeGen/AMDGPU/wave32.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s
7
; Signed i32 compare (loaded value > 0) feeding a select between 1 and 2, on
; an element addressed by the workitem id.
; The GFX1032 checks expect the compare to write vcc_lo; the GFX1064 checks
; expect the full vcc. In both runs v_cndmask reads that same mask register.
8define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
9; GFX1032-LABEL: test_vopc_i32:
10; GFX1032:       ; %bb.0:
11; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
12; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
13; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX1032-NEXT:    global_load_dword v1, v0, s[0:1]
15; GFX1032-NEXT:    s_waitcnt vmcnt(0)
16; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0, v1
17; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
18; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
19; GFX1032-NEXT:    s_endpgm
20;
21; GFX1064-LABEL: test_vopc_i32:
22; GFX1064:       ; %bb.0:
23; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
24; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
25; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
26; GFX1064-NEXT:    global_load_dword v1, v0, s[0:1]
27; GFX1064-NEXT:    s_waitcnt vmcnt(0)
28; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
29; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
30; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
31; GFX1064-NEXT:    s_endpgm
32  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
33  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
34  %load = load i32, ptr addrspace(1) %gep, align 4
35  %cmp = icmp sgt i32 %load, 0
36  %sel = select i1 %cmp, i32 1, i32 2
  ; The result is written back to the same element that was loaded.
37  store i32 %sel, ptr addrspace(1) %gep, align 4
38  ret void
39}
40
; Float variant of the compare-and-select test: `fcmp ugt` against 0.0
; selecting between 1.0 and 2.0.
; Both runs expect v_cmp_nge_f32 with the constant 0 as the first source; the
; mask lands in vcc_lo for the GFX1032 run and in vcc for the GFX1064 run.
41define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
42; GFX1032-LABEL: test_vopc_f32:
43; GFX1032:       ; %bb.0:
44; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
45; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
46; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX1032-NEXT:    global_load_dword v1, v0, s[0:1]
48; GFX1032-NEXT:    s_waitcnt vmcnt(0)
49; GFX1032-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v1
50; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo
51; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
52; GFX1032-NEXT:    s_endpgm
53;
54; GFX1064-LABEL: test_vopc_f32:
55; GFX1064:       ; %bb.0:
56; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
57; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
58; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX1064-NEXT:    global_load_dword v1, v0, s[0:1]
60; GFX1064-NEXT:    s_waitcnt vmcnt(0)
61; GFX1064-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v1
62; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, vcc
63; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
64; GFX1064-NEXT:    s_endpgm
65  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
66  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
67  %load = load float, ptr addrspace(1) %gep, align 4
68  %cmp = fcmp ugt float %load, 0.0
69  %sel = select i1 %cmp, float 1.0, float 2.0
70  store float %sel, ptr addrspace(1) %gep, align 4
71  ret void
72}
73
; amdgpu_ps function that passes an `fcmp oge` result to llvm.amdgcn.kill.
; The checks expect the compare mask to be cleared from the exec mask with
; s_andn2_b32 on exec_lo (GFX1032 run) or s_andn2_b64 on exec (GFX1064 run),
; followed by a branch to a block that zeroes exec and issues a null export.
74define amdgpu_ps void @test_vopc_vcmp(float %x) {
75; GFX1032-LABEL: test_vopc_vcmp:
76; GFX1032:       ; %bb.0:
77; GFX1032-NEXT:    v_cmp_nle_f32_e32 vcc_lo, 0, v0
78; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
79; GFX1032-NEXT:    s_cbranch_scc0 .LBB2_1
80; GFX1032-NEXT:    s_endpgm
81; GFX1032-NEXT:  .LBB2_1:
82; GFX1032-NEXT:    s_mov_b32 exec_lo, 0
83; GFX1032-NEXT:    exp null off, off, off, off done vm
84; GFX1032-NEXT:    s_endpgm
85;
86; GFX1064-LABEL: test_vopc_vcmp:
87; GFX1064:       ; %bb.0:
88; GFX1064-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v0
89; GFX1064-NEXT:    s_andn2_b64 exec, exec, vcc
90; GFX1064-NEXT:    s_cbranch_scc0 .LBB2_1
91; GFX1064-NEXT:    s_endpgm
92; GFX1064-NEXT:  .LBB2_1:
93; GFX1064-NEXT:    s_mov_b64 exec, 0
94; GFX1064-NEXT:    exp null off, off, off, off done vm
95; GFX1064-NEXT:    s_endpgm
96  %cmp = fcmp oge float %x, 0.0
97  call void @llvm.amdgcn.kill(i1 %cmp)
98  ret void
99}
100
; Compares element 1 of a loaded <2 x half> against 0.0 and selects either a
; <1.0, 1.0> vector or the loaded vector.
; Both runs expect an SDWA compare (v_cmp_le_f16_sdwa with src0_sel:WORD_1)
; whose result goes to vcc_lo in the GFX1032 run and vcc in the GFX1064 run.
101define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
102; GFX1032-LABEL: test_vopc_2xf16:
103; GFX1032:       ; %bb.0:
104; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
105; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
106; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
107; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX1032-NEXT:    global_load_dword v1, v0, s[0:1]
109; GFX1032-NEXT:    s_waitcnt vmcnt(0)
110; GFX1032-NEXT:    v_cmp_le_f16_sdwa vcc_lo, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
111; GFX1032-NEXT:    v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc_lo
112; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
113; GFX1032-NEXT:    s_endpgm
114;
115; GFX1064-LABEL: test_vopc_2xf16:
116; GFX1064:       ; %bb.0:
117; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
118; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
119; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
120; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX1064-NEXT:    global_load_dword v1, v0, s[0:1]
122; GFX1064-NEXT:    s_waitcnt vmcnt(0)
123; GFX1064-NEXT:    v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
124; GFX1064-NEXT:    v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc
125; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
126; GFX1064-NEXT:    s_endpgm
127  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
128  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
129  %load = load <2 x half>, ptr addrspace(1) %gep, align 4
  ; Only element 1 of the vector takes part in the compare.
130  %elt = extractelement <2 x half> %load, i32 1
131  %cmp = fcmp ugt half %elt, 0.0
132  %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
133  store <2 x half> %sel, ptr addrspace(1) %gep, align 4
134  ret void
135}
136
; Tests `fabs(x) == +infinity` on a scalar kernel argument, zero-extended and
; stored as an i32.
; Both runs expect a single v_cmp_class_f32_e64 with class mask 0x204. The
; result register is one SGPR (s2) in the GFX1032 run and an SGPR pair
; (s[2:3]) in the GFX1064 run.
137define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
138; GFX1032-LABEL: test_vopc_class:
139; GFX1032:       ; %bb.0:
140; GFX1032-NEXT:    s_clause 0x1
141; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
142; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
143; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
144; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x204
146; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
147; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
148; GFX1032-NEXT:    s_endpgm
149;
150; GFX1064-LABEL: test_vopc_class:
151; GFX1064:       ; %bb.0:
152; GFX1064-NEXT:    s_clause 0x1
153; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x2c
154; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
155; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
156; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], s2, 0x204
158; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
159; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
160; GFX1064-NEXT:    s_endpgm
161  %fabs = tail call float @llvm.fabs.f32(float %x)
  ; 0x7FF0000000000000 is the IEEE-754 double bit pattern for +infinity.
162  %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
163  %ext = zext i1 %cmp to i32
164  store i32 %ext, ptr addrspace(1) %out, align 4
165  ret void
166}
167
; Half-precision compare against +infinity followed by a select between 1.0
; and the original argument.
; Both runs expect v_cmp_neq_f16_e64 with the literal 0x7c00 and a v_cndmask
; using the literal 0x3c00; the mask is vcc_lo in the GFX1032 run and vcc in
; the GFX1064 run.
168define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 {
169; GFX1032-LABEL: test_vcmp_vcnd_f16:
170; GFX1032:       ; %bb.0:
171; GFX1032-NEXT:    s_clause 0x1
172; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
173; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
174; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
175; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
177; GFX1032-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s2
178; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo
179; GFX1032-NEXT:    global_store_short v1, v0, s[0:1]
180; GFX1032-NEXT:    s_endpgm
181;
182; GFX1064-LABEL: test_vcmp_vcnd_f16:
183; GFX1064:       ; %bb.0:
184; GFX1064-NEXT:    s_clause 0x1
185; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x2c
186; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
187; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
188; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
190; GFX1064-NEXT:    v_cmp_neq_f16_e64 vcc, 0x7c00, s2
191; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v0, vcc
192; GFX1064-NEXT:    global_store_short v1, v0, s[0:1]
193; GFX1064-NEXT:    s_endpgm
  ; The half constant is spelled as the double bit pattern for +infinity.
194  %cmp = fcmp oeq half %x, 0x7FF0000000000000
195  %sel = select i1 %cmp, half 1.0, half %x
196  store half %sel, ptr addrspace(1) %out, align 2
197  ret void
198}
199
; Two float compares on the same loaded value combined with a scalar AND.
; The first compare is expected in vcc_lo / vcc (e32 encoding) and the second
; in an explicit SGPR destination (e64 encoding): s0 in the GFX1032 run,
; s[0:1] in the GFX1064 run. The masks are merged with s_and_b32 and
; s_and_b64 respectively.
200define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
201; GFX1032-LABEL: test_vop3_cmp_f32_sop_and:
202; GFX1032:       ; %bb.0:
203; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
204; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
207; GFX1032-NEXT:    s_waitcnt vmcnt(0)
208; GFX1032-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v1
209; GFX1032-NEXT:    v_cmp_nle_f32_e64 s0, 1.0, v1
210; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
211; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s0
212; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
213; GFX1032-NEXT:    s_endpgm
214;
215; GFX1064-LABEL: test_vop3_cmp_f32_sop_and:
216; GFX1064:       ; %bb.0:
217; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
218; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
219; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
221; GFX1064-NEXT:    s_waitcnt vmcnt(0)
222; GFX1064-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v1
223; GFX1064-NEXT:    v_cmp_nle_f32_e64 s[0:1], 1.0, v1
224; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
225; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1]
226; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
227; GFX1064-NEXT:    s_endpgm
228  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
229  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
230  %load = load float, ptr addrspace(1) %gep, align 4
231  %cmp = fcmp ugt float %load, 0.0
232  %cmp2 = fcmp ult float %load, 1.0
  ; The i1 AND of the two compare results is what the checks match as a
  ; scalar mask operation.
233  %and = and i1 %cmp, %cmp2
234  %sel = select i1 %and, float 1.0, float 2.0
235  store float %sel, ptr addrspace(1) %gep, align 4
236  ret void
237}
238
; Two signed i32 compares on the same loaded value combined with a scalar XOR.
; The second compare is expected in an explicit SGPR destination: s0 in the
; GFX1032 run, s[0:1] in the GFX1064 run. The masks are merged with s_xor_b32
; and s_xor_b64 respectively.
239define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
240; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor:
241; GFX1032:       ; %bb.0:
242; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
243; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
244; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
246; GFX1032-NEXT:    s_waitcnt vmcnt(0)
247; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0, v1
248; GFX1032-NEXT:    v_cmp_gt_i32_e64 s0, 1, v1
249; GFX1032-NEXT:    s_xor_b32 s0, vcc_lo, s0
250; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
251; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
252; GFX1032-NEXT:    s_endpgm
253;
254; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor:
255; GFX1064:       ; %bb.0:
256; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
257; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
258; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
260; GFX1064-NEXT:    s_waitcnt vmcnt(0)
261; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
262; GFX1064-NEXT:    v_cmp_gt_i32_e64 s[0:1], 1, v1
263; GFX1064-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
264; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
265; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
266; GFX1064-NEXT:    s_endpgm
267  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
268  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
269  %load = load i32, ptr addrspace(1) %gep, align 4
270  %cmp = icmp sgt i32 %load, 0
271  %cmp2 = icmp slt i32 %load, 1
272  %xor = xor i1 %cmp, %cmp2
273  %sel = select i1 %xor, i32 1, i32 2
274  store i32 %sel, ptr addrspace(1) %gep, align 4
275  ret void
276}
277
; Two unsigned i32 compares on the same loaded value combined with a scalar OR.
; The second compare is expected in an explicit SGPR destination: s0 in the
; GFX1032 run, s[0:1] in the GFX1064 run. The masks are merged with s_or_b32
; and s_or_b64 respectively.
278define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
279; GFX1032-LABEL: test_vop3_cmp_u32_sop_or:
280; GFX1032:       ; %bb.0:
281; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
282; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
283; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
284; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
285; GFX1032-NEXT:    s_waitcnt vmcnt(0)
286; GFX1032-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 3, v1
287; GFX1032-NEXT:    v_cmp_gt_u32_e64 s0, 2, v1
288; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
289; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
290; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
291; GFX1032-NEXT:    s_endpgm
292;
293; GFX1064-LABEL: test_vop3_cmp_u32_sop_or:
294; GFX1064:       ; %bb.0:
295; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
296; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
299; GFX1064-NEXT:    s_waitcnt vmcnt(0)
300; GFX1064-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v1
301; GFX1064-NEXT:    v_cmp_gt_u32_e64 s[0:1], 2, v1
302; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
303; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
304; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
305; GFX1064-NEXT:    s_endpgm
306  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
307  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
308  %load = load i32, ptr addrspace(1) %gep, align 4
309  %cmp = icmp ugt i32 %load, 3
310  %cmp2 = icmp ult i32 %load, 2
311  %or = or i1 %cmp, %cmp2
312  %sel = select i1 %or, i32 1, i32 2
313  store i32 %sel, ptr addrspace(1) %gep, align 4
314  ret void
315}
316
; Conditional store guarded by a branch on `workitem id > 10`.
; The checks expect the branch to become an exec-mask update: the compare
; writes vcc_lo / vcc, followed by s_and_saveexec_b32 (GFX1032 run) or
; s_and_saveexec_b64 (GFX1064 run) and an s_cbranch_execz over the store.
317define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
318; GFX1032-LABEL: test_mask_if:
319; GFX1032:       ; %bb.0:
320; GFX1032-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 10, v0
321; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
322; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
323; GFX1032-NEXT:  ; %bb.1: ; %if
324; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
325; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
326; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX1032-NEXT:    global_store_dword v0, v0, s[0:1]
328; GFX1032-NEXT:  .LBB9_2: ; %endif
329; GFX1032-NEXT:    s_endpgm
330;
331; GFX1064-LABEL: test_mask_if:
332; GFX1064:       ; %bb.0:
333; GFX1064-NEXT:    v_cmp_lt_u32_e32 vcc, 10, v0
334; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
335; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
336; GFX1064-NEXT:  ; %bb.1: ; %if
337; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
338; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
339; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX1064-NEXT:    global_store_dword v0, v0, s[0:1]
341; GFX1064-NEXT:  .LBB9_2: ; %endif
342; GFX1064-NEXT:    s_endpgm
343  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
344  %cmp = icmp ugt i32 %lid, 10
345  br i1 %cmp, label %if, label %endif
346
; Only reached when the workitem id is greater than 10.
347if:
348  store i32 0, ptr addrspace(1) %arg, align 4
349  br label %endif
350
351endif:
352  ret void
353}
354
; Loop whose body holds a nested if/else: %bb5 and %bb10 load and store
; through %arg, %bb11 halves the induction value, and %bb13 is the latch.
; The checks cover the exec-mask bookkeeping for this control flow. The
; GFX1032 run expects 32-bit scalar mask ops on exec_lo and single SGPRs
; (s_and_saveexec_b32, s_or_b32, s_andn2_b32, s_xor_b32). The GFX1064 run
; expects the 64-bit forms on exec and SGPR pairs.
355define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
356; GFX1032-LABEL: test_loop_with_if:
357; GFX1032:       ; %bb.0: ; %bb
358; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
359; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
360; GFX1032-NEXT:    s_mov_b32 s2, 0
361; GFX1032-NEXT:    ; implicit-def: $vgpr2_vgpr3
362; GFX1032-NEXT:    s_branch .LBB10_2
363; GFX1032-NEXT:  .LBB10_1: ; %bb13
364; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
365; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
366; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
367; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4
368; GFX1032-NEXT:    v_add_nc_u32_e32 v1, 1, v4
369; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
370; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
371; GFX1032-NEXT:    s_cbranch_execz .LBB10_8
372; GFX1032-NEXT:  .LBB10_2: ; %bb2
373; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
374; GFX1032-NEXT:    v_cmp_ge_i32_e64 s4, v1, v0
375; GFX1032-NEXT:    s_mov_b32 s3, 0
376; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v1, v0
377; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
378; GFX1032-NEXT:    s_cbranch_execz .LBB10_4
379; GFX1032-NEXT:  ; %bb.3: ; %bb5
380; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
381; GFX1032-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
382; GFX1032-NEXT:    s_andn2_b32 s4, s4, exec_lo
383; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
384; GFX1032-NEXT:    v_lshlrev_b64 v[2:3], 2, v[1:2]
385; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX1032-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v2
387; GFX1032-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
388; GFX1032-NEXT:    global_load_dword v4, v[2:3], off
389; GFX1032-NEXT:    s_waitcnt vmcnt(0)
390; GFX1032-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 11, v4
391; GFX1032-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
392; GFX1032-NEXT:    s_or_b32 s4, s4, s6
393; GFX1032-NEXT:  .LBB10_4: ; %Flow
394; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
395; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s5
396; GFX1032-NEXT:    ; implicit-def: $vgpr4
397; GFX1032-NEXT:    s_and_saveexec_b32 s5, s4
398; GFX1032-NEXT:    s_xor_b32 s4, exec_lo, s5
399; GFX1032-NEXT:  ; %bb.5: ; %bb11
400; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
401; GFX1032-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
402; GFX1032-NEXT:    s_andn2_b32 s3, s3, exec_lo
403; GFX1032-NEXT:    v_add_nc_u32_e32 v4, v1, v4
404; GFX1032-NEXT:    v_ashrrev_i32_e32 v4, 1, v4
405; GFX1032-NEXT:  ; %bb.6: ; %Flow1
406; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
407; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
408; GFX1032-NEXT:    s_and_saveexec_b32 s4, s3
409; GFX1032-NEXT:    s_cbranch_execz .LBB10_1
410; GFX1032-NEXT:  ; %bb.7: ; %bb10
411; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
412; GFX1032-NEXT:    v_mov_b32_e32 v4, v1
413; GFX1032-NEXT:    global_store_dword v[2:3], v0, off
414; GFX1032-NEXT:    s_branch .LBB10_1
415; GFX1032-NEXT:  .LBB10_8: ; %bb1
416; GFX1032-NEXT:    s_endpgm
417;
418; GFX1064-LABEL: test_loop_with_if:
419; GFX1064:       ; %bb.0: ; %bb
420; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
421; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
422; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
423; GFX1064-NEXT:    ; implicit-def: $vgpr2_vgpr3
424; GFX1064-NEXT:    s_branch .LBB10_2
425; GFX1064-NEXT:  .LBB10_1: ; %bb13
426; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
427; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
428; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
429; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0xfe, v4
430; GFX1064-NEXT:    v_add_nc_u32_e32 v1, 1, v4
431; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
432; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
433; GFX1064-NEXT:    s_cbranch_execz .LBB10_8
434; GFX1064-NEXT:  .LBB10_2: ; %bb2
435; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
436; GFX1064-NEXT:    v_cmp_ge_i32_e64 s[6:7], v1, v0
437; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v0
438; GFX1064-NEXT:    s_mov_b64 s[4:5], 0
439; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
440; GFX1064-NEXT:    s_cbranch_execz .LBB10_4
441; GFX1064-NEXT:  ; %bb.3: ; %bb5
442; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
443; GFX1064-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
444; GFX1064-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
445; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
446; GFX1064-NEXT:    v_lshlrev_b64 v[2:3], 2, v[1:2]
447; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
448; GFX1064-NEXT:    v_add_co_u32 v2, vcc, s0, v2
449; GFX1064-NEXT:    v_add_co_ci_u32_e32 v3, vcc, s1, v3, vcc
450; GFX1064-NEXT:    global_load_dword v4, v[2:3], off
451; GFX1064-NEXT:    s_waitcnt vmcnt(0)
452; GFX1064-NEXT:    v_cmp_gt_i32_e32 vcc, 11, v4
453; GFX1064-NEXT:    s_and_b64 s[10:11], vcc, exec
454; GFX1064-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
455; GFX1064-NEXT:  .LBB10_4: ; %Flow
456; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
457; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
458; GFX1064-NEXT:    ; implicit-def: $vgpr4
459; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
460; GFX1064-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
461; GFX1064-NEXT:  ; %bb.5: ; %bb11
462; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
463; GFX1064-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
464; GFX1064-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
465; GFX1064-NEXT:    v_add_nc_u32_e32 v4, v1, v4
466; GFX1064-NEXT:    v_ashrrev_i32_e32 v4, 1, v4
467; GFX1064-NEXT:  ; %bb.6: ; %Flow1
468; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
469; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
470; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
471; GFX1064-NEXT:    s_cbranch_execz .LBB10_1
472; GFX1064-NEXT:  ; %bb.7: ; %bb10
473; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
474; GFX1064-NEXT:    v_mov_b32_e32 v4, v1
475; GFX1064-NEXT:    global_store_dword v[2:3], v0, off
476; GFX1064-NEXT:    s_branch .LBB10_1
477; GFX1064-NEXT:  .LBB10_8: ; %bb1
478; GFX1064-NEXT:    s_endpgm
479bb:
480  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
481  br label %bb2
482
; Loop exit.
483bb1:
484  ret void
485
; Loop header: %tmp3 starts at 0 and is compared against the workitem id.
486bb2:
487  %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ]
488  %tmp4 = icmp slt i32 %tmp3, %tmp
489  br i1 %tmp4, label %bb5, label %bb11
490
491bb5:
492  %tmp6 = sext i32 %tmp3 to i64
493  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
494  %tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
495  %tmp9 = icmp sgt i32 %tmp8, 10
496  br i1 %tmp9, label %bb10, label %bb11
497
498bb10:
499  store i32 %tmp, ptr addrspace(1) %tmp7, align 4
500  br label %bb13
501
; Reached from %bb2 when %tmp3 is not below the workitem id, and from %bb5
; when the loaded value is not greater than 10.
502bb11:
503  %tmp12 = sdiv i32 %tmp3, 2
504  br label %bb13
505
; Latch: the loop repeats while the merged value is below 255.
506bb13:
507  %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ]
508  %tmp15 = add nsw i32 %tmp14, 1
509  %tmp16 = icmp slt i32 %tmp14, 255
510  br i1 %tmp16, label %bb2, label %bb1
511}
512
513
514
; Loop with two exits to %.loopexit: one from the header %bb2 when the loaded
; value is not greater than 10, and one from %bb8 when the incremented counter
; reaches 256 or the workitem id. Workitem 0 skips the loop entirely.
; The GFX1032 run expects the break mask to be accumulated with 32-bit scalar
; ops (s_and_b32, s_or_b32, s_andn2_b32 on exec_lo). The GFX1064 run expects
; the 64-bit forms on exec and SGPR pairs.
515define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
516; GFX1032-LABEL: test_loop_with_if_else_break:
517; GFX1032:       ; %bb.0: ; %bb
518; GFX1032-NEXT:    s_mov_b32 s2, 0
519; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
520; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
521; GFX1032-NEXT:    s_cbranch_execz .LBB11_6
522; GFX1032-NEXT:  ; %bb.1: ; %.preheader
523; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
524; GFX1032-NEXT:    v_min_u32_e32 v1, 0x100, v0
525; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
526; GFX1032-NEXT:    s_mov_b32 s3, 0
527; GFX1032-NEXT:    ; implicit-def: $sgpr4
528; GFX1032-NEXT:    s_branch .LBB11_4
529; GFX1032-NEXT:  .LBB11_2: ; %bb8
530; GFX1032-NEXT:    ; in Loop: Header=BB11_4 Depth=1
531; GFX1032-NEXT:    s_add_i32 s3, s3, 1
532; GFX1032-NEXT:    global_store_dword v2, v0, s[0:1]
533; GFX1032-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s3, v1
534; GFX1032-NEXT:    s_add_u32 s0, s0, 4
535; GFX1032-NEXT:    s_addc_u32 s1, s1, 0
536; GFX1032-NEXT:    s_andn2_b32 s4, s4, exec_lo
537; GFX1032-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
538; GFX1032-NEXT:    s_or_b32 s4, s4, s5
539; GFX1032-NEXT:  .LBB11_3: ; %Flow
540; GFX1032-NEXT:    ; in Loop: Header=BB11_4 Depth=1
541; GFX1032-NEXT:    s_and_b32 s5, exec_lo, s4
542; GFX1032-NEXT:    s_or_b32 s2, s5, s2
543; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
544; GFX1032-NEXT:    s_cbranch_execz .LBB11_6
545; GFX1032-NEXT:  .LBB11_4: ; %bb2
546; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
547; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX1032-NEXT:    global_load_dword v3, v2, s[0:1]
549; GFX1032-NEXT:    s_or_b32 s4, s4, exec_lo
550; GFX1032-NEXT:    s_waitcnt vmcnt(0)
551; GFX1032-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 11, v3
552; GFX1032-NEXT:    s_cbranch_vccz .LBB11_2
553; GFX1032-NEXT:  ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
554; GFX1032-NEXT:    ; implicit-def: $sgpr3
555; GFX1032-NEXT:    ; implicit-def: $sgpr0_sgpr1
556; GFX1032-NEXT:    s_branch .LBB11_3
557; GFX1032-NEXT:  .LBB11_6: ; %.loopexit
558; GFX1032-NEXT:    s_endpgm
559;
560; GFX1064-LABEL: test_loop_with_if_else_break:
561; GFX1064:       ; %bb.0: ; %bb
562; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
563; GFX1064-NEXT:    s_mov_b32 s6, 0
564; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
565; GFX1064-NEXT:    s_cbranch_execz .LBB11_6
566; GFX1064-NEXT:  ; %bb.1: ; %.preheader
567; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
568; GFX1064-NEXT:    v_min_u32_e32 v1, 0x100, v0
569; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
570; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
571; GFX1064-NEXT:    ; implicit-def: $sgpr4_sgpr5
572; GFX1064-NEXT:    s_branch .LBB11_4
573; GFX1064-NEXT:  .LBB11_2: ; %bb8
574; GFX1064-NEXT:    ; in Loop: Header=BB11_4 Depth=1
575; GFX1064-NEXT:    s_add_i32 s6, s6, 1
576; GFX1064-NEXT:    global_store_dword v2, v0, s[0:1]
577; GFX1064-NEXT:    v_cmp_ge_u32_e32 vcc, s6, v1
578; GFX1064-NEXT:    s_add_u32 s0, s0, 4
579; GFX1064-NEXT:    s_addc_u32 s1, s1, 0
580; GFX1064-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
581; GFX1064-NEXT:    s_and_b64 s[8:9], vcc, exec
582; GFX1064-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
583; GFX1064-NEXT:  .LBB11_3: ; %Flow
584; GFX1064-NEXT:    ; in Loop: Header=BB11_4 Depth=1
585; GFX1064-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
586; GFX1064-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3]
587; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
588; GFX1064-NEXT:    s_cbranch_execz .LBB11_6
589; GFX1064-NEXT:  .LBB11_4: ; %bb2
590; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
591; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX1064-NEXT:    global_load_dword v3, v2, s[0:1]
593; GFX1064-NEXT:    s_or_b64 s[4:5], s[4:5], exec
594; GFX1064-NEXT:    s_waitcnt vmcnt(0)
595; GFX1064-NEXT:    v_cmp_gt_i32_e32 vcc, 11, v3
596; GFX1064-NEXT:    s_cbranch_vccz .LBB11_2
597; GFX1064-NEXT:  ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
598; GFX1064-NEXT:    ; implicit-def: $sgpr6
599; GFX1064-NEXT:    ; implicit-def: $sgpr0_sgpr1
600; GFX1064-NEXT:    s_branch .LBB11_3
601; GFX1064-NEXT:  .LBB11_6: ; %.loopexit
602; GFX1064-NEXT:    s_endpgm
603bb:
604  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
605  %tmp1 = icmp eq i32 %tmp, 0
606  br i1 %tmp1, label %.loopexit, label %.preheader
607
608.preheader:
609  br label %bb2
610
; Loop header: the element at index %tmp3 is loaded, and the loop is left
; when that value is not greater than 10.
611bb2:
612  %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
613  %tmp4 = zext i32 %tmp3 to i64
614  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
615  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
616  %tmp7 = icmp sgt i32 %tmp6, 10
617  br i1 %tmp7, label %bb8, label %.loopexit
618
; The loop repeats only while the incremented counter is below both 256 and
; the workitem id.
619bb8:
620  store i32 %tmp, ptr addrspace(1) %tmp5, align 4
621  %tmp9 = add nuw nsw i32 %tmp3, 1
622  %tmp10 = icmp ult i32 %tmp9, 256
623  %tmp11 = icmp ult i32 %tmp9, %tmp
624  %tmp12 = and i1 %tmp10, %tmp11
625  br i1 %tmp12, label %bb2, label %.loopexit
626
627.loopexit:
628  ret void
629}
630
; 64-bit add of a loaded value and a kernel argument, checked as a
; v_add_co_u32 / v_add_co_ci_u32 pair.
; The carry between the two halves is expected in vcc_lo for the GFX1032 run
; and in vcc for the GFX1064 run.
631define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
632; GFX1032-LABEL: test_addc_vop2b:
633; GFX1032:       ; %bb.0: ; %bb
634; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
635; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
636; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
638; GFX1032-NEXT:    s_waitcnt vmcnt(0)
639; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
640; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
641; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
642; GFX1032-NEXT:    s_endpgm
643;
644; GFX1064-LABEL: test_addc_vop2b:
645; GFX1064:       ; %bb.0: ; %bb
646; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
647; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
648; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
650; GFX1064-NEXT:    s_waitcnt vmcnt(0)
651; GFX1064-NEXT:    v_add_co_u32 v0, vcc, v0, s2
652; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
653; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
654; GFX1064-NEXT:    s_endpgm
655bb:
656  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
657  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
658  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
659  %tmp5 = add nsw i64 %tmp4, %arg1
660  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
661  ret void
662}
663
; 64-bit subtract with the loaded value as minuend (loaded - %arg1), checked
; as a v_sub_co_u32 / v_subrev_co_ci_u32 pair.
; The borrow between the two halves is expected in vcc_lo for the GFX1032 run
; and in vcc for the GFX1064 run.
664define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
665; GFX1032-LABEL: test_subbrev_vop2b:
666; GFX1032:       ; %bb.0: ; %bb
667; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
668; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
669; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
671; GFX1032-NEXT:    s_waitcnt vmcnt(0)
672; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
673; GFX1032-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
674; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
675; GFX1032-NEXT:    s_endpgm
676;
677; GFX1064-LABEL: test_subbrev_vop2b:
678; GFX1064:       ; %bb.0: ; %bb
679; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
680; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
681; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
682; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
683; GFX1064-NEXT:    s_waitcnt vmcnt(0)
684; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, v0, s2
685; GFX1064-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
686; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
687; GFX1064-NEXT:    s_endpgm
688bb:
689  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
690  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
691  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  ; Operand order matters here: the loaded value is on the left.
692  %tmp5 = sub nsw i64 %tmp4, %arg1
693  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
694  ret void
695}
696
; 64-bit subtract with the kernel argument as minuend (%arg1 - loaded),
; checked as a v_sub_co_u32 / v_sub_co_ci_u32 pair.
; The borrow between the two halves is expected in vcc_lo for the GFX1032 run
; and in vcc for the GFX1064 run.
697define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
698; GFX1032-LABEL: test_subb_vop2b:
699; GFX1032:       ; %bb.0: ; %bb
700; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
701; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
702; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
703; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
704; GFX1032-NEXT:    s_waitcnt vmcnt(0)
705; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
706; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
707; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
708; GFX1032-NEXT:    s_endpgm
709;
710; GFX1064-LABEL: test_subb_vop2b:
711; GFX1064:       ; %bb.0: ; %bb
712; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
713; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
714; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
716; GFX1064-NEXT:    s_waitcnt vmcnt(0)
717; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
718; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
719; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
720; GFX1064-NEXT:    s_endpgm
721bb:
722  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
723  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
724  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  ; Operand order matters here: the kernel argument is on the left.
725  %tmp5 = sub nsw i64 %arg1, %tmp4
726  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
727  ret void
728}
729
; 64-bit unsigned division on values fetched with scalar loads:
; %arg[2] = %arg[1] / %arg[0].
; The expected code ORs the two operands and tests the high dword of the
; result; when it is zero the branch goes (via .LBB15_4) to the short 32-bit
; sequence at .LBB15_2, otherwise the long expansion in %bb.1 runs.
; Wave-size dependent parts: the carry-out of v_add_co_u32 / v_sub_co_u32 is a
; single SGPR tested with s_cmp_lg_u32 in the wave32 run, and an SGPR pair
; tested with s_cmp_lg_u64 in the wave64 run; s_cselect into the condition
; register is _b32 vcc_lo versus _b64 vcc.
730define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
731; GFX1032-LABEL: test_udiv64:
732; GFX1032:       ; %bb.0: ; %bb
733; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
734; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
736; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
737; GFX1032-NEXT:    s_or_b64 s[8:9], s[6:7], s[4:5]
738; GFX1032-NEXT:    s_mov_b32 s8, 0
739; GFX1032-NEXT:    s_cmp_lg_u64 s[8:9], 0
740; GFX1032-NEXT:    s_cbranch_scc0 .LBB15_4
741; GFX1032-NEXT:  ; %bb.1:
742; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
743; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s5
744; GFX1032-NEXT:    s_sub_u32 s9, 0, s4
745; GFX1032-NEXT:    s_subb_u32 s10, 0, s5
746; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
747; GFX1032-NEXT:    v_rcp_f32_e32 v0, v0
748; GFX1032-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
749; GFX1032-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
750; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
751; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
752; GFX1032-NEXT:    v_cvt_u32_f32_e32 v1, v1
753; GFX1032-NEXT:    v_cvt_u32_f32_e32 v0, v0
754; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
755; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
756; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
757; GFX1032-NEXT:    s_mul_hi_u32 s13, s9, s1
758; GFX1032-NEXT:    s_mul_i32 s12, s10, s1
759; GFX1032-NEXT:    s_add_i32 s11, s13, s11
760; GFX1032-NEXT:    s_mul_i32 s14, s9, s1
761; GFX1032-NEXT:    s_add_i32 s11, s11, s12
762; GFX1032-NEXT:    s_mul_hi_u32 s13, s1, s14
763; GFX1032-NEXT:    s_mul_hi_u32 s15, s0, s14
764; GFX1032-NEXT:    s_mul_i32 s12, s0, s14
765; GFX1032-NEXT:    s_mul_hi_u32 s14, s1, s11
766; GFX1032-NEXT:    s_mul_i32 s1, s1, s11
767; GFX1032-NEXT:    s_mul_hi_u32 s16, s0, s11
768; GFX1032-NEXT:    s_add_u32 s1, s13, s1
769; GFX1032-NEXT:    s_addc_u32 s13, 0, s14
770; GFX1032-NEXT:    s_add_u32 s1, s1, s12
771; GFX1032-NEXT:    s_mul_i32 s11, s0, s11
772; GFX1032-NEXT:    s_addc_u32 s1, s13, s15
773; GFX1032-NEXT:    s_addc_u32 s12, s16, 0
774; GFX1032-NEXT:    s_add_u32 s1, s1, s11
775; GFX1032-NEXT:    s_addc_u32 s11, 0, s12
776; GFX1032-NEXT:    v_add_co_u32 v0, s1, v0, s1
777; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
778; GFX1032-NEXT:    s_addc_u32 s0, s0, s11
779; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
780; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
781; GFX1032-NEXT:    s_mul_hi_u32 s12, s9, s1
782; GFX1032-NEXT:    s_mul_i32 s10, s10, s1
783; GFX1032-NEXT:    s_add_i32 s11, s12, s11
784; GFX1032-NEXT:    s_mul_i32 s9, s9, s1
785; GFX1032-NEXT:    s_add_i32 s11, s11, s10
786; GFX1032-NEXT:    s_mul_hi_u32 s12, s0, s9
787; GFX1032-NEXT:    s_mul_i32 s13, s0, s9
788; GFX1032-NEXT:    s_mul_hi_u32 s9, s1, s9
789; GFX1032-NEXT:    s_mul_hi_u32 s14, s1, s11
790; GFX1032-NEXT:    s_mul_i32 s1, s1, s11
791; GFX1032-NEXT:    s_mul_hi_u32 s10, s0, s11
792; GFX1032-NEXT:    s_add_u32 s1, s9, s1
793; GFX1032-NEXT:    s_addc_u32 s9, 0, s14
794; GFX1032-NEXT:    s_add_u32 s1, s1, s13
795; GFX1032-NEXT:    s_mul_i32 s11, s0, s11
796; GFX1032-NEXT:    s_addc_u32 s1, s9, s12
797; GFX1032-NEXT:    s_addc_u32 s9, s10, 0
798; GFX1032-NEXT:    s_add_u32 s1, s1, s11
799; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
800; GFX1032-NEXT:    v_add_co_u32 v0, s1, v0, s1
801; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
802; GFX1032-NEXT:    s_addc_u32 s0, s0, s9
803; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
804; GFX1032-NEXT:    s_mul_i32 s10, s6, s0
805; GFX1032-NEXT:    s_mul_hi_u32 s9, s6, s0
806; GFX1032-NEXT:    s_mul_hi_u32 s11, s7, s0
807; GFX1032-NEXT:    s_mul_i32 s0, s7, s0
808; GFX1032-NEXT:    s_mul_hi_u32 s12, s6, s1
809; GFX1032-NEXT:    s_mul_hi_u32 s13, s7, s1
810; GFX1032-NEXT:    s_mul_i32 s1, s7, s1
811; GFX1032-NEXT:    s_add_u32 s10, s12, s10
812; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
813; GFX1032-NEXT:    s_add_u32 s1, s10, s1
814; GFX1032-NEXT:    s_addc_u32 s1, s9, s13
815; GFX1032-NEXT:    s_addc_u32 s9, s11, 0
816; GFX1032-NEXT:    s_add_u32 s1, s1, s0
817; GFX1032-NEXT:    s_addc_u32 s9, 0, s9
818; GFX1032-NEXT:    s_mul_hi_u32 s0, s4, s1
819; GFX1032-NEXT:    s_mul_i32 s11, s4, s9
820; GFX1032-NEXT:    s_mul_i32 s12, s4, s1
821; GFX1032-NEXT:    s_add_i32 s0, s0, s11
822; GFX1032-NEXT:    v_sub_co_u32 v0, s11, s6, s12
823; GFX1032-NEXT:    s_mul_i32 s10, s5, s1
824; GFX1032-NEXT:    s_add_i32 s0, s0, s10
825; GFX1032-NEXT:    v_sub_co_u32 v1, s12, v0, s4
826; GFX1032-NEXT:    s_sub_i32 s10, s7, s0
827; GFX1032-NEXT:    s_cmp_lg_u32 s11, 0
828; GFX1032-NEXT:    s_subb_u32 s10, s10, s5
829; GFX1032-NEXT:    s_cmp_lg_u32 s12, 0
830; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v1
831; GFX1032-NEXT:    s_subb_u32 s10, s10, 0
832; GFX1032-NEXT:    s_cmp_ge_u32 s10, s5
833; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
834; GFX1032-NEXT:    s_cselect_b32 s12, -1, 0
835; GFX1032-NEXT:    s_cmp_eq_u32 s10, s5
836; GFX1032-NEXT:    s_cselect_b32 vcc_lo, -1, 0
837; GFX1032-NEXT:    s_add_u32 s10, s1, 1
838; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
839; GFX1032-NEXT:    s_addc_u32 s12, s9, 0
840; GFX1032-NEXT:    s_add_u32 s13, s1, 2
841; GFX1032-NEXT:    s_addc_u32 s14, s9, 0
842; GFX1032-NEXT:    s_cmp_lg_u32 s11, 0
843; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v0
844; GFX1032-NEXT:    s_subb_u32 s0, s7, s0
845; GFX1032-NEXT:    v_mov_b32_e32 v2, s13
846; GFX1032-NEXT:    s_cmp_ge_u32 s0, s5
847; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
848; GFX1032-NEXT:    s_cselect_b32 s7, -1, 0
849; GFX1032-NEXT:    s_cmp_eq_u32 s0, s5
850; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
851; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
852; GFX1032-NEXT:    v_mov_b32_e32 v1, s14
853; GFX1032-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s0
854; GFX1032-NEXT:    v_cndmask_b32_e32 v2, s10, v2, vcc_lo
855; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
856; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
857; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc_lo
858; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
859; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
860; GFX1032-NEXT:    s_cbranch_vccnz .LBB15_3
861; GFX1032-NEXT:  .LBB15_2:
862; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
863; GFX1032-NEXT:    s_sub_i32 s1, 0, s4
864; GFX1032-NEXT:    v_rcp_iflag_f32_e32 v0, v0
865; GFX1032-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
866; GFX1032-NEXT:    v_cvt_u32_f32_e32 v0, v0
867; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
868; GFX1032-NEXT:    s_mul_i32 s1, s1, s0
869; GFX1032-NEXT:    s_mul_hi_u32 s1, s0, s1
870; GFX1032-NEXT:    s_add_i32 s0, s0, s1
871; GFX1032-NEXT:    s_mul_hi_u32 s0, s6, s0
872; GFX1032-NEXT:    s_mul_i32 s1, s0, s4
873; GFX1032-NEXT:    s_add_i32 s5, s0, 1
874; GFX1032-NEXT:    s_sub_i32 s1, s6, s1
875; GFX1032-NEXT:    s_sub_i32 s6, s1, s4
876; GFX1032-NEXT:    s_cmp_ge_u32 s1, s4
877; GFX1032-NEXT:    s_cselect_b32 s0, s5, s0
878; GFX1032-NEXT:    s_cselect_b32 s1, s6, s1
879; GFX1032-NEXT:    s_add_i32 s5, s0, 1
880; GFX1032-NEXT:    s_cmp_ge_u32 s1, s4
881; GFX1032-NEXT:    s_mov_b32 s1, 0
882; GFX1032-NEXT:    s_cselect_b32 s0, s5, s0
883; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
884; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
885; GFX1032-NEXT:  .LBB15_3:
886; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
887; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
888; GFX1032-NEXT:    s_endpgm
889; GFX1032-NEXT:  .LBB15_4:
890; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
891; GFX1032-NEXT:    s_branch .LBB15_2
892;
893; GFX1064-LABEL: test_udiv64:
894; GFX1064:       ; %bb.0: ; %bb
895; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
896; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
897; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
898; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX1064-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
900; GFX1064-NEXT:    s_mov_b32 s0, 0
901; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
902; GFX1064-NEXT:    s_cbranch_scc0 .LBB15_4
903; GFX1064-NEXT:  ; %bb.1:
904; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s4
905; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s5
906; GFX1064-NEXT:    s_sub_u32 s9, 0, s4
907; GFX1064-NEXT:    s_subb_u32 s10, 0, s5
908; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
909; GFX1064-NEXT:    v_rcp_f32_e32 v0, v0
910; GFX1064-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
911; GFX1064-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
912; GFX1064-NEXT:    v_trunc_f32_e32 v1, v1
913; GFX1064-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
914; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
915; GFX1064-NEXT:    v_cvt_u32_f32_e32 v0, v0
916; GFX1064-NEXT:    v_readfirstlane_b32 s8, v1
917; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
918; GFX1064-NEXT:    s_mul_i32 s1, s9, s8
919; GFX1064-NEXT:    s_mul_hi_u32 s12, s9, s0
920; GFX1064-NEXT:    s_mul_i32 s11, s10, s0
921; GFX1064-NEXT:    s_add_i32 s1, s12, s1
922; GFX1064-NEXT:    s_mul_i32 s13, s9, s0
923; GFX1064-NEXT:    s_add_i32 s1, s1, s11
924; GFX1064-NEXT:    s_mul_hi_u32 s12, s0, s13
925; GFX1064-NEXT:    s_mul_hi_u32 s14, s8, s13
926; GFX1064-NEXT:    s_mul_i32 s11, s8, s13
927; GFX1064-NEXT:    s_mul_hi_u32 s13, s0, s1
928; GFX1064-NEXT:    s_mul_i32 s0, s0, s1
929; GFX1064-NEXT:    s_mul_hi_u32 s15, s8, s1
930; GFX1064-NEXT:    s_add_u32 s0, s12, s0
931; GFX1064-NEXT:    s_addc_u32 s12, 0, s13
932; GFX1064-NEXT:    s_add_u32 s0, s0, s11
933; GFX1064-NEXT:    s_mul_i32 s1, s8, s1
934; GFX1064-NEXT:    s_addc_u32 s0, s12, s14
935; GFX1064-NEXT:    s_addc_u32 s11, s15, 0
936; GFX1064-NEXT:    s_add_u32 s0, s0, s1
937; GFX1064-NEXT:    s_addc_u32 s11, 0, s11
938; GFX1064-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
939; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
940; GFX1064-NEXT:    s_addc_u32 s8, s8, s11
941; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
942; GFX1064-NEXT:    s_mul_i32 s1, s9, s8
943; GFX1064-NEXT:    s_mul_hi_u32 s11, s9, s0
944; GFX1064-NEXT:    s_mul_i32 s10, s10, s0
945; GFX1064-NEXT:    s_add_i32 s1, s11, s1
946; GFX1064-NEXT:    s_mul_i32 s9, s9, s0
947; GFX1064-NEXT:    s_add_i32 s1, s1, s10
948; GFX1064-NEXT:    s_mul_hi_u32 s11, s8, s9
949; GFX1064-NEXT:    s_mul_i32 s12, s8, s9
950; GFX1064-NEXT:    s_mul_hi_u32 s9, s0, s9
951; GFX1064-NEXT:    s_mul_hi_u32 s13, s0, s1
952; GFX1064-NEXT:    s_mul_i32 s0, s0, s1
953; GFX1064-NEXT:    s_mul_hi_u32 s10, s8, s1
954; GFX1064-NEXT:    s_add_u32 s0, s9, s0
955; GFX1064-NEXT:    s_addc_u32 s9, 0, s13
956; GFX1064-NEXT:    s_add_u32 s0, s0, s12
957; GFX1064-NEXT:    s_mul_i32 s1, s8, s1
958; GFX1064-NEXT:    s_addc_u32 s0, s9, s11
959; GFX1064-NEXT:    s_addc_u32 s9, s10, 0
960; GFX1064-NEXT:    s_add_u32 s0, s0, s1
961; GFX1064-NEXT:    s_addc_u32 s9, 0, s9
962; GFX1064-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
963; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
964; GFX1064-NEXT:    s_addc_u32 s0, s8, s9
965; GFX1064-NEXT:    v_readfirstlane_b32 s1, v0
966; GFX1064-NEXT:    s_mul_i32 s9, s6, s0
967; GFX1064-NEXT:    s_mul_hi_u32 s8, s6, s0
968; GFX1064-NEXT:    s_mul_hi_u32 s10, s7, s0
969; GFX1064-NEXT:    s_mul_i32 s0, s7, s0
970; GFX1064-NEXT:    s_mul_hi_u32 s11, s6, s1
971; GFX1064-NEXT:    s_mul_hi_u32 s12, s7, s1
972; GFX1064-NEXT:    s_mul_i32 s1, s7, s1
973; GFX1064-NEXT:    s_add_u32 s9, s11, s9
974; GFX1064-NEXT:    s_addc_u32 s8, 0, s8
975; GFX1064-NEXT:    s_add_u32 s1, s9, s1
976; GFX1064-NEXT:    s_addc_u32 s1, s8, s12
977; GFX1064-NEXT:    s_addc_u32 s8, s10, 0
978; GFX1064-NEXT:    s_add_u32 s10, s1, s0
979; GFX1064-NEXT:    s_addc_u32 s11, 0, s8
980; GFX1064-NEXT:    s_mul_hi_u32 s0, s4, s10
981; GFX1064-NEXT:    s_mul_i32 s1, s4, s11
982; GFX1064-NEXT:    s_mul_i32 s9, s4, s10
983; GFX1064-NEXT:    s_add_i32 s12, s0, s1
984; GFX1064-NEXT:    v_sub_co_u32 v0, s[0:1], s6, s9
985; GFX1064-NEXT:    s_mul_i32 s8, s5, s10
986; GFX1064-NEXT:    s_add_i32 s12, s12, s8
987; GFX1064-NEXT:    v_sub_co_u32 v1, s[8:9], v0, s4
988; GFX1064-NEXT:    s_sub_i32 s13, s7, s12
989; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
990; GFX1064-NEXT:    s_subb_u32 s13, s13, s5
991; GFX1064-NEXT:    s_cmp_lg_u64 s[8:9], 0
992; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
993; GFX1064-NEXT:    s_subb_u32 s8, s13, 0
994; GFX1064-NEXT:    s_cmp_ge_u32 s8, s5
995; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
996; GFX1064-NEXT:    s_cselect_b32 s9, -1, 0
997; GFX1064-NEXT:    s_cmp_eq_u32 s8, s5
998; GFX1064-NEXT:    s_cselect_b64 vcc, -1, 0
999; GFX1064-NEXT:    s_add_u32 s8, s10, 1
1000; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
1001; GFX1064-NEXT:    s_addc_u32 s9, s11, 0
1002; GFX1064-NEXT:    s_add_u32 s13, s10, 2
1003; GFX1064-NEXT:    s_addc_u32 s14, s11, 0
1004; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
1005; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
1006; GFX1064-NEXT:    s_subb_u32 s0, s7, s12
1007; GFX1064-NEXT:    v_mov_b32_e32 v2, s13
1008; GFX1064-NEXT:    s_cmp_ge_u32 s0, s5
1009; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1010; GFX1064-NEXT:    s_cselect_b32 s7, -1, 0
1011; GFX1064-NEXT:    s_cmp_eq_u32 s0, s5
1012; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1013; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
1014; GFX1064-NEXT:    v_mov_b32_e32 v1, s14
1015; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s[0:1]
1016; GFX1064-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc
1017; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
1018; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1019; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s11, v1, vcc
1020; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s10, v2, vcc
1021; GFX1064-NEXT:    s_cbranch_execnz .LBB15_3
1022; GFX1064-NEXT:  .LBB15_2:
1023; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s4
1024; GFX1064-NEXT:    s_sub_i32 s1, 0, s4
1025; GFX1064-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1026; GFX1064-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1027; GFX1064-NEXT:    v_cvt_u32_f32_e32 v0, v0
1028; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
1029; GFX1064-NEXT:    s_mul_i32 s1, s1, s0
1030; GFX1064-NEXT:    s_mul_hi_u32 s1, s0, s1
1031; GFX1064-NEXT:    s_add_i32 s0, s0, s1
1032; GFX1064-NEXT:    s_mul_hi_u32 s0, s6, s0
1033; GFX1064-NEXT:    s_mul_i32 s1, s0, s4
1034; GFX1064-NEXT:    s_add_i32 s5, s0, 1
1035; GFX1064-NEXT:    s_sub_i32 s1, s6, s1
1036; GFX1064-NEXT:    s_sub_i32 s6, s1, s4
1037; GFX1064-NEXT:    s_cmp_ge_u32 s1, s4
1038; GFX1064-NEXT:    s_cselect_b32 s0, s5, s0
1039; GFX1064-NEXT:    s_cselect_b32 s1, s6, s1
1040; GFX1064-NEXT:    s_add_i32 s5, s0, 1
1041; GFX1064-NEXT:    s_cmp_ge_u32 s1, s4
1042; GFX1064-NEXT:    s_mov_b32 s1, 0
1043; GFX1064-NEXT:    s_cselect_b32 s0, s5, s0
1044; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
1045; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
1046; GFX1064-NEXT:  .LBB15_3:
1047; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
1048; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:16
1049; GFX1064-NEXT:    s_endpgm
1050; GFX1064-NEXT:  .LBB15_4:
1051; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1052; GFX1064-NEXT:    s_branch .LBB15_2
1053bb:
  ; Dividend is element 1 of %arg, divisor is element 0.
1054  %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1
1055  %tmp1 = load i64, ptr addrspace(1) %tmp, align 8
1056  %tmp2 = load i64, ptr addrspace(1) %arg, align 8
1057  %tmp3 = udiv i64 %tmp1, %tmp2
  ; The quotient goes to element 2 (the "offset:16" store in the expected output).
1058  %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 2
1059  store i64 %tmp3, ptr addrspace(1) %tmp4, align 8
1060  ret void
1061}
1062
; llvm.amdgcn.div.scale.f32 on two volatile-loaded floats; only the float
; member of the returned pair is stored. The expected v_div_scale_f32 still
; names a scalar destination for the i1 member: s2 in the wave32 run and
; s[2:3] in the wave64 run.
1063define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1064; GFX1032-LABEL: test_div_scale_f32:
1065; GFX1032:       ; %bb.0:
1066; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1067; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1068; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1069; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
1070; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1071; GFX1032-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
1072; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1073; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1074; GFX1032-NEXT:    v_div_scale_f32 v1, s2, v2, v2, v1
1075; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
1076; GFX1032-NEXT:    s_endpgm
1077;
1078; GFX1064-LABEL: test_div_scale_f32:
1079; GFX1064:       ; %bb.0:
1080; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1081; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1082; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
1084; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1085; GFX1064-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
1086; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1087; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1088; GFX1064-NEXT:    v_div_scale_f32 v1, s[2:3], v2, v2, v1
1089; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
1090; GFX1064-NEXT:    s_endpgm
  ; %gep.0 / %gep.1 are two consecutive floats starting at this work item's index.
1091  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1092  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
1093  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
1094
1095  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
1096  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
1097
  ; The third intrinsic operand is the constant false in this f32 variant.
1098  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
1099  %result0 = extractvalue { float, i1 } %result, 0
1100  store float %result0, ptr addrspace(1) %out, align 4
1101  ret void
1102}
1103
; llvm.amdgcn.div.scale.f64 on two volatile-loaded doubles; only the double
; member of the returned pair is stored. The scalar destination printed on the
; expected v_div_scale_f64 is s0 in the wave32 run and s[0:1] in the wave64
; run. %aptr is declared but not referenced in the body.
1104define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 {
1105; GFX1032-LABEL: test_div_scale_f64:
1106; GFX1032:       ; %bb.0:
1107; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1108; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1109; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc
1111; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1112; GFX1032-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc
1113; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1114; GFX1032-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1]
1115; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1116; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1117; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1118; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1119; GFX1032-NEXT:    s_endpgm
1120;
1121; GFX1064-LABEL: test_div_scale_f64:
1122; GFX1064:       ; %bb.0:
1123; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1124; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1125; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1126; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc
1127; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1128; GFX1064-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc
1129; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1130; GFX1064-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1]
1131; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1132; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
1133; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1135; GFX1064-NEXT:    s_endpgm
  ; %gep.0 / %gep.1 are two consecutive doubles starting at this work item's index.
1136  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1137  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
1138  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
1139
1140  %a = load volatile double, ptr addrspace(1) %gep.0, align 8
1141  %b = load volatile double, ptr addrspace(1) %gep.1, align 8
1142
  ; The third intrinsic operand is the constant true in this f64 variant.
1143  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
1144  %result0 = extractvalue { double, i1 } %result, 0
1145  store double %result0, ptr addrspace(1) %out, align 8
1146  ret void
1147}
1148
; Non-kernel function: sext(%arg0) * sext(%arg1) + %arg2 is expected to become
; a single v_mad_i64_i32. The scalar destination operand printed on that
; instruction is s4 in the wave32 run and s[4:5] in the wave64 run.
1149define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
1150; GFX1032-LABEL: test_mad_i64_i32:
1151; GFX1032:       ; %bb.0:
1152; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153; GFX1032-NEXT:    v_mad_i64_i32 v[0:1], s4, v0, v1, v[2:3]
1154; GFX1032-NEXT:    s_setpc_b64 s[30:31]
1155;
1156; GFX1064-LABEL: test_mad_i64_i32:
1157; GFX1064:       ; %bb.0:
1158; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1159; GFX1064-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
1160; GFX1064-NEXT:    s_setpc_b64 s[30:31]
  ; Signed widening of both 32-bit factors before the 64-bit multiply-add.
1161  %sext0 = sext i32 %arg0 to i64
1162  %sext1 = sext i32 %arg1 to i64
1163  %mul = mul i64 %sext0, %sext1
1164  %mad = add i64 %mul, %arg2
1165  ret i64 %mad
1166}
1167
; Non-kernel function: zext(%arg0) * zext(%arg1) + %arg2 is expected to become
; a single v_mad_u64_u32. The scalar destination operand printed on that
; instruction is s4 in the wave32 run and s[4:5] in the wave64 run.
1168define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
1169; GFX1032-LABEL: test_mad_u64_u32:
1170; GFX1032:       ; %bb.0:
1171; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1172; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s4, v0, v1, v[2:3]
1173; GFX1032-NEXT:    s_setpc_b64 s[30:31]
1174;
1175; GFX1064-LABEL: test_mad_u64_u32:
1176; GFX1064:       ; %bb.0:
1177; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
1179; GFX1064-NEXT:    s_setpc_b64 s[30:31]
  ; NOTE: despite their names, %sext0 and %sext1 are produced by zext here;
  ; this is the unsigned counterpart of the signed multiply-add test.
1180  %sext0 = zext i32 %arg0 to i64
1181  %sext1 = zext i32 %arg1 to i64
1182  %mul = mul i64 %sext0, %sext1
1183  %mad = add i64 %mul, %arg2
1184  ret i64 %mad
1185}
1186
; llvm.amdgcn.div.fmas.f32 with its i1 operand taken from a kernel argument.
; The expected code tests bit 0 of the loaded argument (s_bitcmp1_b32) and
; materializes the result in vcc_lo via s_cselect_b32 in the wave32 run, or in
; vcc via s_cselect_b64 in the wave64 run. The printed v_div_fmas_f32 carries
; no explicit condition operand (presumably it reads VCC implicitly).
1187define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i1 %d) nounwind {
1188; GFX1032-LABEL: test_div_fmas_f32:
1189; GFX1032:       ; %bb.0:
1190; GFX1032-NEXT:    s_clause 0x1
1191; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
1192; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1193; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1194; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
1196; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
1197; GFX1032-NEXT:    s_bitcmp1_b32 s3, 0
1198; GFX1032-NEXT:    s_cselect_b32 vcc_lo, -1, 0
1199; GFX1032-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
1200; GFX1032-NEXT:    global_store_dword v2, v0, s[6:7]
1201; GFX1032-NEXT:    s_endpgm
1202;
1203; GFX1064-LABEL: test_div_fmas_f32:
1204; GFX1064:       ; %bb.0:
1205; GFX1064-NEXT:    s_clause 0x1
1206; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
1207; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1208; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
1209; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX1064-NEXT:    v_mov_b32_e32 v0, s1
1211; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
1212; GFX1064-NEXT:    s_bitcmp1_b32 s3, 0
1213; GFX1064-NEXT:    s_cselect_b64 vcc, -1, 0
1214; GFX1064-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
1215; GFX1064-NEXT:    global_store_dword v2, v0, s[6:7]
1216; GFX1064-NEXT:    s_endpgm
1217  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
1218  store float %result, ptr addrspace(1) %out, align 4
1219  ret void
1220}
1221
; Double-precision counterpart of the div.fmas test: llvm.amdgcn.div.fmas.f64
; with its i1 operand taken from a kernel argument. The expected code sets up
; the condition with s_bitcmp1_b32 followed by s_cselect_b32 vcc_lo (wave32
; run) or s_cselect_b64 vcc (wave64 run); the printed v_div_fmas_f64 carries no
; explicit condition operand (presumably it reads VCC implicitly).
1222define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
1223; GFX1032-LABEL: test_div_fmas_f64:
1224; GFX1032:       ; %bb.0:
1225; GFX1032-NEXT:    s_clause 0x1
1226; GFX1032-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1227; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x44
1228; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX1032-NEXT:    v_mov_b32_e32 v0, s12
1230; GFX1032-NEXT:    v_mov_b32_e32 v1, s13
1231; GFX1032-NEXT:    v_mov_b32_e32 v2, s14
1232; GFX1032-NEXT:    v_mov_b32_e32 v3, s15
1233; GFX1032-NEXT:    s_bitcmp1_b32 s0, 0
1234; GFX1032-NEXT:    s_cselect_b32 vcc_lo, -1, 0
1235; GFX1032-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
1236; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1237; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
1238; GFX1032-NEXT:    s_endpgm
1239;
1240; GFX1064-LABEL: test_div_fmas_f64:
1241; GFX1064:       ; %bb.0:
1242; GFX1064-NEXT:    s_clause 0x1
1243; GFX1064-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1244; GFX1064-NEXT:    s_load_dword s0, s[4:5], 0x44
1245; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
1247; GFX1064-NEXT:    v_mov_b32_e32 v1, s13
1248; GFX1064-NEXT:    v_mov_b32_e32 v2, s14
1249; GFX1064-NEXT:    v_mov_b32_e32 v3, s15
1250; GFX1064-NEXT:    s_bitcmp1_b32 s0, 0
1251; GFX1064-NEXT:    s_cselect_b64 vcc, -1, 0
1252; GFX1064-NEXT:    v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3]
1253; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
1254; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
1255; GFX1064-NEXT:    s_endpgm
1256  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
1257  store double %result, ptr addrspace(1) %out, align 8
1258  ret void
1259}
1260
1261
1262
; div.fmas whose i1 operand is a phi merged from two predecessors: constant
; false from %entry and a compare result from %bb (taken only when the work
; item id is 0). In the expected output the condition register is zeroed
; before the branch (s_mov_b32 vcc_lo, 0 / s_mov_b64 vcc, 0) and, inside %bb,
; the compare result is ANDed with exec before the blocks rejoin at %exit.
; Register widths follow the wave size: vcc_lo / exec_lo / s_and_saveexec_b32
; in the wave32 run, vcc / exec / s_and_saveexec_b64 in the wave64 run.
1263define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
1264; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc:
1265; GFX1032:       ; %bb.0: ; %entry
1266; GFX1032-NEXT:    s_clause 0x1
1267; GFX1032-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1268; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
1269; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1270; GFX1032-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
1271; GFX1032-NEXT:    s_mov_b32 vcc_lo, 0
1272; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1273; GFX1032-NEXT:    global_load_dwordx3 v[1:3], v1, s[10:11]
1274; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
1275; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
1276; GFX1032-NEXT:  ; %bb.1: ; %bb
1277; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1278; GFX1032-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
1279; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1280; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1281; GFX1032-NEXT:    s_and_b32 vcc_lo, vcc_lo, exec_lo
1282; GFX1032-NEXT:  .LBB22_2: ; %exit
1283; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1284; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1285; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1286; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1287; GFX1032-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
1288; GFX1032-NEXT:    global_store_dword v0, v1, s[8:9] offset:8
1289; GFX1032-NEXT:    s_endpgm
1290;
1291; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc:
1292; GFX1064:       ; %bb.0: ; %entry
1293; GFX1064-NEXT:    s_clause 0x1
1294; GFX1064-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1295; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1296; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1297; GFX1064-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
1298; GFX1064-NEXT:    s_mov_b64 vcc, 0
1299; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1300; GFX1064-NEXT:    global_load_dwordx3 v[1:3], v1, s[10:11]
1301; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
1302; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
1303; GFX1064-NEXT:  ; %bb.1: ; %bb
1304; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1305; GFX1064-NEXT:    global_load_dword v0, v0, s[6:7] glc dlc
1306; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1307; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1308; GFX1064-NEXT:    s_and_b64 vcc, vcc, exec
1309; GFX1064-NEXT:  .LBB22_2: ; %exit
1310; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1311; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1312; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1313; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1314; GFX1064-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
1315; GFX1064-NEXT:    global_store_dword v0, v1, s[8:9] offset:8
1316; GFX1064-NEXT:    s_endpgm
1317entry:
  ; Three consecutive floats per work item are read from %in; the result goes
  ; to the fixed slot %out[2].
1318  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1319  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
1320  %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
1321  %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
1322  %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2
1323
1324  %a = load float, ptr addrspace(1) %gep.a
1325  %b = load float, ptr addrspace(1) %gep.b
1326  %c = load float, ptr addrspace(1) %gep.c
1327
  ; Only the work item with id 0 takes the %bb path.
1328  %cmp0 = icmp eq i32 %tid, 0
1329  br i1 %cmp0, label %bb, label %exit
1330
1331bb:
  ; The volatile load keeps the compare input from being folded away.
1332  %val = load volatile i32, ptr addrspace(1) %dummy
1333  %cmp1 = icmp ne i32 %val, 0
1334  br label %exit
1335
1336exit:
  ; The i1 phi is the value under test: it feeds the div.fmas condition operand.
1337  %cond = phi i1 [false, %entry], [%cmp1, %bb]
1338  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
1339  store float %result, ptr addrspace(1) %gep.out, align 4
1340  ret void
1341}
1342
1343
; Plain IR fdiv on two float kernel arguments. The expected expansion is
; v_div_scale_f32 (twice), v_rcp_f32, an fma/fmac refinement chain,
; v_div_fmas_f32 and v_div_fixup_f32. The first v_div_scale_f32 writes its
; scalar result to s4 (wave32 run) or s[4:5] (wave64 run); the second one
; writes vcc_lo or vcc respectively, ahead of v_div_fmas_f32.
1344define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
1345; GFX1032-LABEL: fdiv_f32:
1346; GFX1032:       ; %bb.0: ; %entry
1347; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1348; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1349; GFX1032-NEXT:    v_div_scale_f32 v0, s4, s3, s3, s2
1350; GFX1032-NEXT:    v_rcp_f32_e32 v1, v0
1351; GFX1032-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
1352; GFX1032-NEXT:    v_fmac_f32_e32 v1, v2, v1
1353; GFX1032-NEXT:    v_div_scale_f32 v2, vcc_lo, s2, s3, s2
1354; GFX1032-NEXT:    v_mul_f32_e32 v3, v2, v1
1355; GFX1032-NEXT:    v_fma_f32 v4, -v0, v3, v2
1356; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v1
1357; GFX1032-NEXT:    v_fma_f32 v0, -v0, v3, v2
1358; GFX1032-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
1359; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1360; GFX1032-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
1361; GFX1032-NEXT:    global_store_dword v1, v0, s[0:1]
1362; GFX1032-NEXT:    s_endpgm
1363;
1364; GFX1064-LABEL: fdiv_f32:
1365; GFX1064:       ; %bb.0: ; %entry
1366; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1367; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX1064-NEXT:    v_div_scale_f32 v0, s[4:5], s3, s3, s2
1369; GFX1064-NEXT:    v_rcp_f32_e32 v1, v0
1370; GFX1064-NEXT:    v_fma_f32 v2, -v0, v1, 1.0
1371; GFX1064-NEXT:    v_fmac_f32_e32 v1, v2, v1
1372; GFX1064-NEXT:    v_div_scale_f32 v2, vcc, s2, s3, s2
1373; GFX1064-NEXT:    v_mul_f32_e32 v3, v2, v1
1374; GFX1064-NEXT:    v_fma_f32 v4, -v0, v3, v2
1375; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v1
1376; GFX1064-NEXT:    v_fma_f32 v0, -v0, v3, v2
1377; GFX1064-NEXT:    v_div_fmas_f32 v0, v0, v1, v3
1378; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1379; GFX1064-NEXT:    v_div_fixup_f32 v0, v0, s3, s2
1380; GFX1064-NEXT:    global_store_dword v1, v0, s[0:1]
1381; GFX1064-NEXT:    s_endpgm
1382entry:
1383  %fdiv = fdiv float %a, %b
1384  store float %fdiv, ptr addrspace(1) %out
1385  ret void
1386}
1387
; Conditional branch on a half-precision ordered less-than compare. The
; expected code emits the inverted compare v_cmp_nlt_f16 into vcc_lo (wave32
; run) or vcc (wave64 run) and uses s_cbranch_vccnz to reach %two, leaving %one
; as the fall-through. Note that the parameter list of this definition
; continues after the assertion lines.
1388define amdgpu_kernel void @test_br_cc_f16(
1389; GFX1032-LABEL: test_br_cc_f16:
1390; GFX1032:       ; %bb.0: ; %entry
1391; GFX1032-NEXT:    s_clause 0x1
1392; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1393; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1394; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1395; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1396; GFX1032-NEXT:    s_clause 0x1
1397; GFX1032-NEXT:    global_load_ushort v1, v0, s[2:3]
1398; GFX1032-NEXT:    global_load_ushort v2, v0, s[6:7]
1399; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1400; GFX1032-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v2
1401; GFX1032-NEXT:    s_cbranch_vccnz .LBB24_2
1402; GFX1032-NEXT:  ; %bb.1: ; %one
1403; GFX1032-NEXT:    global_store_short v0, v1, s[0:1]
1404; GFX1032-NEXT:    s_endpgm
1405; GFX1032-NEXT:  .LBB24_2: ; %two
1406; GFX1032-NEXT:    global_store_short v0, v2, s[0:1]
1407; GFX1032-NEXT:    s_endpgm
1408;
1409; GFX1064-LABEL: test_br_cc_f16:
1410; GFX1064:       ; %bb.0: ; %entry
1411; GFX1064-NEXT:    s_clause 0x1
1412; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1413; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1414; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1415; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX1064-NEXT:    s_clause 0x1
1417; GFX1064-NEXT:    global_load_ushort v1, v0, s[2:3]
1418; GFX1064-NEXT:    global_load_ushort v2, v0, s[6:7]
1419; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1420; GFX1064-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v2
1421; GFX1064-NEXT:    s_cbranch_vccnz .LBB24_2
1422; GFX1064-NEXT:  ; %bb.1: ; %one
1423; GFX1064-NEXT:    global_store_short v0, v1, s[0:1]
1424; GFX1064-NEXT:    s_endpgm
1425; GFX1064-NEXT:  .LBB24_2: ; %two
1426; GFX1064-NEXT:    global_store_short v0, v2, s[0:1]
1427; GFX1064-NEXT:    s_endpgm
1428    ptr addrspace(1) %r,
1429    ptr addrspace(1) %a,
1430    ptr addrspace(1) %b) {
1431entry:
1432  %a.val = load half, ptr addrspace(1) %a
1433  %b.val = load half, ptr addrspace(1) %b
1434  %fcmp = fcmp olt half %a.val, %b.val
1435  br i1 %fcmp, label %one, label %two
1436
  ; a < b: store the value loaded from %a.
1437one:
1438  store half %a.val, ptr addrspace(1) %r
1439  ret void
1440
  ; otherwise (including unordered): store the value loaded from %b.
1441two:
1442  store half %b.val, ptr addrspace(1) %r
1443  ret void
1444}
1445
; Branch on an i1 kernel argument: the %store block writes 222 (0xde in the
; checks) to %out, the %end block just returns. %in is not used.
; Both wave sizes share a single set of checks for this function: the expected
; code tests bit 0 of the loaded argument with s_bitcmp0_b32 and branches on
; SCC, so no wave-size-dependent mask register appears.
1446define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 {
1447; GCN-LABEL: test_brcc_i1:
1448; GCN:       ; %bb.0:
1449; GCN-NEXT:    s_load_dword s0, s[4:5], 0x34
1450; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1451; GCN-NEXT:    s_bitcmp0_b32 s0, 0
1452; GCN-NEXT:    s_cbranch_scc1 .LBB25_2
1453; GCN-NEXT:  ; %bb.1: ; %store
1454; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1455; GCN-NEXT:    v_mov_b32_e32 v0, 0
1456; GCN-NEXT:    v_mov_b32_e32 v1, 0xde
1457; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1458; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
1459; GCN-NEXT:  .LBB25_2: ; %end
1460; GCN-NEXT:    s_endpgm
1461  %cmp0 = icmp ne i1 %val, 0
1462  br i1 %cmp0, label %store, label %end
1463
1464store:
1465  store i32 222, ptr addrspace(1) %out
1466  ret void
1467
1468end:
1469  ret void
1470}
1471
; Branch on the AND of three ordered float compares. One of them compares
; %arg against a select whose condition is llvm.amdgcn.class.f32 called on
; undef operands.
; The expected code uses the inverted compares (v_cmp_nlt / v_cmp_ngt), ORs
; the three masks together, ANDs the result with exec into the VCC register
; and skips %bb1 with s_cbranch_vccnz. The wave32 checks use the 32-bit forms
; (s_or_b32, vcc_lo, exec_lo); the wave64 checks use s_or_b64, vcc and exec.
1472define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 {
1473; GFX1032-LABEL: test_preserve_condition_undef_flag:
1474; GFX1032:       ; %bb.0: ; %bb0
1475; GFX1032-NEXT:    s_clause 0x1
1476; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x2c
1477; GFX1032-NEXT:    s_load_dword s1, s[4:5], 0x24
1478; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1479; GFX1032-NEXT:    v_cmp_nlt_f32_e64 s2, s0, 1.0
1480; GFX1032-NEXT:    v_cmp_nlt_f32_e64 s1, s1, 1.0
1481; GFX1032-NEXT:    v_cmp_ngt_f32_e64 s0, s0, 0
1482; GFX1032-NEXT:    s_or_b32 s1, s2, s1
1483; GFX1032-NEXT:    s_or_b32 s0, s1, s0
1484; GFX1032-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
1485; GFX1032-NEXT:    s_cbranch_vccnz .LBB26_2
1486; GFX1032-NEXT:  ; %bb.1: ; %bb1
1487; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1488; GFX1032-NEXT:    global_store_dword v[0:1], v0, off
1489; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1490; GFX1032-NEXT:  .LBB26_2: ; %bb2
1491; GFX1032-NEXT:    s_endpgm
1492;
1493; GFX1064-LABEL: test_preserve_condition_undef_flag:
1494; GFX1064:       ; %bb.0: ; %bb0
1495; GFX1064-NEXT:    s_clause 0x1
1496; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
1497; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x24
1498; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1499; GFX1064-NEXT:    v_cmp_nlt_f32_e64 s[0:1], s6, 1.0
1500; GFX1064-NEXT:    v_cmp_nlt_f32_e64 s[2:3], s2, 1.0
1501; GFX1064-NEXT:    v_cmp_ngt_f32_e64 s[4:5], s6, 0
1502; GFX1064-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1503; GFX1064-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
1504; GFX1064-NEXT:    s_and_b64 vcc, exec, s[0:1]
1505; GFX1064-NEXT:    s_cbranch_vccnz .LBB26_2
1506; GFX1064-NEXT:  ; %bb.1: ; %bb1
1507; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1508; GFX1064-NEXT:    global_store_dword v[0:1], v0, off
1509; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1510; GFX1064-NEXT:  .LBB26_2: ; %bb2
1511; GFX1064-NEXT:    s_endpgm
1512bb0:
  ; %tmp is not referenced by any later instruction in this function.
1513  %tmp = icmp sgt i32 %arg1, 4
1514  %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
1515  %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
1516  %tmp5 = fcmp ogt float %arg2, 0.000000e+00
1517  %tmp6 = fcmp olt float %arg2, 1.000000e+00
1518  %tmp7 = fcmp olt float %arg, %tmp4
1519  %tmp8 = and i1 %tmp5, %tmp6
1520  %tmp9 = and i1 %tmp8, %tmp7
1521  br i1 %tmp9, label %bb1, label %bb2
1522
1523bb1:
1524  store volatile i32 0, ptr addrspace(1) undef
1525  br label %bb2
1526
1527bb2:
1528  ret void
1529}
1530
; Loop whose back-edge condition in %Flow is an i1 phi: the constant true when
; control arrives straight from %bb1, and %cmp1 when it arrives through %bb4.
; In the expected code the phi lives in a lane mask (s1 on wave32, s[2:3] on
; wave64). It is preset by OR-ing in exec at the loop header and rewritten in
; %bb4 from the v_cmp result. %Flow inverts it with s_xor ..., -1, ANDs it
; with exec, accumulates that into s0 / s[0:1] and clears those bits from exec
; with s_andn2; s_cbranch_execz then leaves the loop for %bb9.
1531define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 {
1532; GFX1032-LABEL: test_invert_true_phi_cond_break_loop:
1533; GFX1032:       ; %bb.0: ; %bb
1534; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x24
1535; GFX1032-NEXT:    ; implicit-def: $sgpr1
1536; GFX1032-NEXT:    ; implicit-def: $sgpr2
1537; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX1032-NEXT:    v_subrev_nc_u32_e32 v0, s0, v0
1539; GFX1032-NEXT:    s_mov_b32 s0, 0
1540; GFX1032-NEXT:    s_branch .LBB27_2
1541; GFX1032-NEXT:  .LBB27_1: ; %Flow
1542; GFX1032-NEXT:    ; in Loop: Header=BB27_2 Depth=1
1543; GFX1032-NEXT:    s_xor_b32 s3, s1, -1
1544; GFX1032-NEXT:    s_add_i32 s2, s2, 1
1545; GFX1032-NEXT:    s_and_b32 s3, exec_lo, s3
1546; GFX1032-NEXT:    s_or_b32 s0, s3, s0
1547; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
1548; GFX1032-NEXT:    s_cbranch_execz .LBB27_4
1549; GFX1032-NEXT:  .LBB27_2: ; %bb1
1550; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
1551; GFX1032-NEXT:    s_or_b32 s1, s1, exec_lo
1552; GFX1032-NEXT:    s_cmp_gt_i32 s2, -1
1553; GFX1032-NEXT:    s_cbranch_scc1 .LBB27_1
1554; GFX1032-NEXT:  ; %bb.3: ; %bb4
1555; GFX1032-NEXT:    ; in Loop: Header=BB27_2 Depth=1
1556; GFX1032-NEXT:    global_load_dword v1, v[0:1], off glc dlc
1557; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1558; GFX1032-NEXT:    s_andn2_b32 s1, s1, exec_lo
1559; GFX1032-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v1
1560; GFX1032-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
1561; GFX1032-NEXT:    s_or_b32 s1, s1, s3
1562; GFX1032-NEXT:    s_branch .LBB27_1
1563; GFX1032-NEXT:  .LBB27_4: ; %bb9
1564; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1565; GFX1032-NEXT:    v_mov_b32_e32 v0, 7
1566; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1567; GFX1032-NEXT:    ds_write_b32 v0, v0
1568; GFX1032-NEXT:    s_endpgm
1569;
1570; GFX1064-LABEL: test_invert_true_phi_cond_break_loop:
1571; GFX1064:       ; %bb.0: ; %bb
1572; GFX1064-NEXT:    s_load_dword s0, s[4:5], 0x24
1573; GFX1064-NEXT:    ; implicit-def: $sgpr2_sgpr3
1574; GFX1064-NEXT:    ; implicit-def: $sgpr4
1575; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX1064-NEXT:    v_subrev_nc_u32_e32 v0, s0, v0
1577; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
1578; GFX1064-NEXT:    s_branch .LBB27_2
1579; GFX1064-NEXT:  .LBB27_1: ; %Flow
1580; GFX1064-NEXT:    ; in Loop: Header=BB27_2 Depth=1
1581; GFX1064-NEXT:    s_xor_b64 s[6:7], s[2:3], -1
1582; GFX1064-NEXT:    s_add_i32 s4, s4, 1
1583; GFX1064-NEXT:    s_and_b64 s[6:7], exec, s[6:7]
1584; GFX1064-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
1585; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1586; GFX1064-NEXT:    s_cbranch_execz .LBB27_4
1587; GFX1064-NEXT:  .LBB27_2: ; %bb1
1588; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
1589; GFX1064-NEXT:    s_or_b64 s[2:3], s[2:3], exec
1590; GFX1064-NEXT:    s_cmp_gt_i32 s4, -1
1591; GFX1064-NEXT:    s_cbranch_scc1 .LBB27_1
1592; GFX1064-NEXT:  ; %bb.3: ; %bb4
1593; GFX1064-NEXT:    ; in Loop: Header=BB27_2 Depth=1
1594; GFX1064-NEXT:    global_load_dword v1, v[0:1], off glc dlc
1595; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1596; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
1597; GFX1064-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
1598; GFX1064-NEXT:    s_and_b64 s[6:7], vcc, exec
1599; GFX1064-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
1600; GFX1064-NEXT:    s_branch .LBB27_1
1601; GFX1064-NEXT:  .LBB27_4: ; %bb9
1602; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1603; GFX1064-NEXT:    v_mov_b32_e32 v0, 7
1604; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1605; GFX1064-NEXT:    ds_write_b32 v0, v0
1606; GFX1064-NEXT:    s_endpgm
1607bb:
1608  %id = call i32 @llvm.amdgcn.workitem.id.x()
1609  %tmp = sub i32 %id, %arg
1610  br label %bb1
1611
1612bb1:                                              ; preds = %Flow, %bb
1613  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
1614  %lsr.iv.next = add i32 %lsr.iv, 1
1615  %cmp0 = icmp slt i32 %lsr.iv.next, 0
1616  br i1 %cmp0, label %bb4, label %Flow
1617
1618bb4:                                              ; preds = %bb1
1619  %load = load volatile i32, ptr addrspace(1) undef, align 4
1620  %cmp1 = icmp sge i32 %tmp, %load
1621  br label %Flow
1622
1623Flow:                                             ; preds = %bb4, %bb1
1624  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
  ; The "true" incoming value from %bb1 is the case the test name refers to.
1625  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
1626  br i1 %tmp3, label %bb1, label %bb9
1627
1628bb9:                                              ; preds = %Flow
1629  store volatile i32 7, ptr addrspace(3) undef
1630  ret void
1631}
1632
; Dynamic extractelement from the constant vector <0, 1, 2, 3> using a
; per-lane index of workitem.id.x - 512 (the 0xfffffe00 add in the checks).
; Despite "movrels" in the name, the expected code for both wave sizes is a
; chain of v_cmp / v_cndmask selects on the index. The two check blocks differ
; only in the condition register: vcc_lo on wave32, vcc on wave64.
1633define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) %out) #0 {
1634; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr:
1635; GFX1032:       ; %bb.0: ; %entry
1636; GFX1032-NEXT:    v_add_nc_u32_e32 v0, 0xfffffe00, v0
1637; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1638; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
1639; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
1640; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
1641; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 2, v0
1642; GFX1032-NEXT:    v_cndmask_b32_e32 v1, 2, v1, vcc_lo
1643; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 3, v0
1644; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 3, v1, vcc_lo
1645; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1646; GFX1032-NEXT:    global_store_dword v2, v0, s[0:1]
1647; GFX1032-NEXT:    s_endpgm
1648;
1649; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr:
1650; GFX1064:       ; %bb.0: ; %entry
1651; GFX1064-NEXT:    v_add_nc_u32_e32 v0, 0xfffffe00, v0
1652; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1653; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
1654; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
1655; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
1656; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
1657; GFX1064-NEXT:    v_cndmask_b32_e32 v1, 2, v1, vcc
1658; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
1659; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 3, v1, vcc
1660; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1661; GFX1064-NEXT:    global_store_dword v2, v0, s[0:1]
1662; GFX1064-NEXT:    s_endpgm
1663entry:
1664  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
1665  %index = add i32 %id, -512
1666  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
1667  store i32 %value, ptr addrspace(1) %out
1668  ret void
1669}
1670
; llvm.amdgcn.set.inactive.i32(%in, 42) whose result is passed through
; llvm.amdgcn.strict.wwm.i32 and stored to %out.
; Expected code: s_or_saveexec with -1 saves the previous exec mask, a single
; v_cndmask picks between 42 and %in using that saved mask, and exec is
; restored with s_mov before the store. The saved mask is one SGPR (s3) on
; wave32 and an SGPR pair (s[2:3]) on wave64.
1671define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 {
1672; GFX1032-LABEL: test_set_inactive:
1673; GFX1032:       ; %bb.0:
1674; GFX1032-NEXT:    s_clause 0x1
1675; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
1676; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1677; GFX1032-NEXT:    s_or_saveexec_b32 s3, -1
1678; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 42, s2, s3
1680; GFX1032-NEXT:    s_mov_b32 exec_lo, s3
1681; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1682; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
1683; GFX1032-NEXT:    global_store_dword v1, v2, s[0:1]
1684; GFX1032-NEXT:    s_endpgm
1685;
1686; GFX1064-LABEL: test_set_inactive:
1687; GFX1064:       ; %bb.0:
1688; GFX1064-NEXT:    s_clause 0x1
1689; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
1690; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1691; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1692; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 42, s6, s[2:3]
1694; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1695; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1696; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
1697; GFX1064-NEXT:    global_store_dword v1, v2, s[0:1]
1698; GFX1064-NEXT:    s_endpgm
1699  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
1700  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
1701  store i32 %tmp, ptr addrspace(1) %out
1702  ret void
1703}
1704
; 64-bit variant of the set.inactive test: llvm.amdgcn.set.inactive.i64(%in, 0)
; passed through llvm.amdgcn.strict.wwm.i64 and stored to %out.
; Expected code: one s_or_saveexec with -1, then two v_cndmask instructions
; (one per 32-bit half of the value) selecting between 0 and the loaded
; argument with the saved mask, then exec is restored. The saved mask is s4 on
; wave32 and s[4:5] on wave64.
1705define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
1706; GFX1032-LABEL: test_set_inactive_64:
1707; GFX1032:       ; %bb.0:
1708; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1709; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1710; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, s4
1712; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 0, s2, s4
1713; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1714; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
1715; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
1716; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
1717; GFX1032-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
1718; GFX1032-NEXT:    s_endpgm
1719;
1720; GFX1064-LABEL: test_set_inactive_64:
1721; GFX1064:       ; %bb.0:
1722; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1723; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1724; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, s[4:5]
1726; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 0, s2, s[4:5]
1727; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1728; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
1729; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
1730; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
1731; GFX1064-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
1732; GFX1064-NEXT:    s_endpgm
1733  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
1734  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
1735  store i64 %tmp, ptr addrspace(1) %out
1736  ret void
1737}
1738
; llvm.amdgcn.kill with the constant condition false in an amdgpu_ps function.
; NOTE(review): the name says "float" but the operand is the constant i1
; false -- presumably a leftover from an older form of this test; confirm
; before renaming.
; Expected code clears exec with s_andn2 exec, exec, exec, then s_cbranch_scc0
; goes to a block that zeroes exec and issues a null export with done/vm.
; wave32 uses the _b32 / exec_lo forms, wave64 the _b64 / exec forms.
1739define amdgpu_ps void @test_kill_i1_terminator_float() #0 {
1740; GFX1032-LABEL: test_kill_i1_terminator_float:
1741; GFX1032:       ; %bb.0:
1742; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
1743; GFX1032-NEXT:    s_cbranch_scc0 .LBB31_1
1744; GFX1032-NEXT:    s_endpgm
1745; GFX1032-NEXT:  .LBB31_1:
1746; GFX1032-NEXT:    s_mov_b32 exec_lo, 0
1747; GFX1032-NEXT:    exp null off, off, off, off done vm
1748; GFX1032-NEXT:    s_endpgm
1749;
1750; GFX1064-LABEL: test_kill_i1_terminator_float:
1751; GFX1064:       ; %bb.0:
1752; GFX1064-NEXT:    s_andn2_b64 exec, exec, exec
1753; GFX1064-NEXT:    s_cbranch_scc0 .LBB31_1
1754; GFX1064-NEXT:    s_endpgm
1755; GFX1064-NEXT:  .LBB31_1:
1756; GFX1064-NEXT:    s_mov_b64 exec, 0
1757; GFX1064-NEXT:    exp null off, off, off, off done vm
1758; GFX1064-NEXT:    s_endpgm
1759  call void @llvm.amdgcn.kill(i1 false)
1760  ret void
1761}
1762
; llvm.amdgcn.kill on a non-constant condition (the OR of two signed
; less-than compares of VGPR arguments) in an amdgpu_gs function, followed by
; an llvm.amdgcn.exp.f32 export.
; Expected code ORs the two compare masks, computes exec AND NOT condition
; with s_andn2, removes those bits from a copy of exec and ANDs the result
; into exec before the export. All mask operations are _b32 on wave32 and
; _b64 on wave64.
1763define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
1764; GFX1032-LABEL: test_kill_i1_terminator_i1:
1765; GFX1032:       ; %bb.0:
1766; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v1
1767; GFX1032-NEXT:    v_cmp_lt_i32_e64 s0, v2, v3
1768; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
1769; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
1770; GFX1032-NEXT:    s_andn2_b32 s0, exec_lo, s0
1771; GFX1032-NEXT:    s_andn2_b32 s1, s1, s0
1772; GFX1032-NEXT:    s_and_b32 exec_lo, exec_lo, s1
1773; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1774; GFX1032-NEXT:    exp mrt0 off, off, off, off
1775; GFX1032-NEXT:    s_endpgm
1776; GFX1032-NEXT:  ; %bb.1:
1777; GFX1032-NEXT:    s_mov_b32 exec_lo, 0
1778; GFX1032-NEXT:    s_endpgm
1779;
1780; GFX1064-LABEL: test_kill_i1_terminator_i1:
1781; GFX1064:       ; %bb.0:
1782; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v1
1783; GFX1064-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v3
1784; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1785; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1786; GFX1064-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
1787; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
1788; GFX1064-NEXT:    s_and_b64 exec, exec, s[2:3]
1789; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1790; GFX1064-NEXT:    exp mrt0 off, off, off, off
1791; GFX1064-NEXT:    s_endpgm
1792; GFX1064-NEXT:  ; %bb.1:
1793; GFX1064-NEXT:    s_mov_b64 exec, 0
1794; GFX1064-NEXT:    s_endpgm
1795  %c1 = icmp slt i32 %a, %b
1796  %c2 = icmp slt i32 %c, %d
1797  %x = or i1 %c1, %c2
1798  call void @llvm.amdgcn.kill(i1 %x)
1799  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
1800  ret void
1801}
1802
; amdgpu_ps loop that exits to %break once the float counter is greater than
; 7.0 (0x40e00000 in the checks). Each iteration of %body samples a 1D image
; at the first element of the carried vector and adds 2.0 to the counter.
; Expected code: the loop compare writes vcc_lo (wave32) or vcc (wave64) and
; the loop is driven by s_cbranch_vccz. Exec is saved and passed through s_wqm
; on entry, and ANDed with the saved mask again at %break.
1803define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
1804; GFX1032-LABEL: test_loop_vcc:
1805; GFX1032:       ; %bb.0: ; %entry
1806; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
1807; GFX1032-NEXT:    s_wqm_b32 exec_lo, exec_lo
1808; GFX1032-NEXT:    v_mov_b32_e32 v7, v3
1809; GFX1032-NEXT:    v_mov_b32_e32 v6, v2
1810; GFX1032-NEXT:    v_mov_b32_e32 v5, v1
1811; GFX1032-NEXT:    v_mov_b32_e32 v4, v0
1812; GFX1032-NEXT:    v_mov_b32_e32 v8, 0
1813; GFX1032-NEXT:    s_branch .LBB33_2
1814; GFX1032-NEXT:  .LBB33_1: ; %body
1815; GFX1032-NEXT:    ; in Loop: Header=BB33_2 Depth=1
1816; GFX1032-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1817; GFX1032-NEXT:    v_add_f32_e32 v8, 2.0, v8
1818; GFX1032-NEXT:    s_cbranch_execz .LBB33_4
1819; GFX1032-NEXT:  .LBB33_2: ; %loop
1820; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
1821; GFX1032-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1822; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1823; GFX1032-NEXT:    v_mov_b32_e32 v0, v4
1824; GFX1032-NEXT:    v_mov_b32_e32 v1, v5
1825; GFX1032-NEXT:    v_mov_b32_e32 v2, v6
1826; GFX1032-NEXT:    v_mov_b32_e32 v3, v7
1827; GFX1032-NEXT:    s_cbranch_vccz .LBB33_1
1828; GFX1032-NEXT:  ; %bb.3:
1829; GFX1032-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1830; GFX1032-NEXT:    ; implicit-def: $vgpr8
1831; GFX1032-NEXT:  .LBB33_4: ; %break
1832; GFX1032-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1833; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1834; GFX1032-NEXT:    ; return to shader part epilog
1835;
1836; GFX1064-LABEL: test_loop_vcc:
1837; GFX1064:       ; %bb.0: ; %entry
1838; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
1839; GFX1064-NEXT:    s_wqm_b64 exec, exec
1840; GFX1064-NEXT:    v_mov_b32_e32 v7, v3
1841; GFX1064-NEXT:    v_mov_b32_e32 v6, v2
1842; GFX1064-NEXT:    v_mov_b32_e32 v5, v1
1843; GFX1064-NEXT:    v_mov_b32_e32 v4, v0
1844; GFX1064-NEXT:    v_mov_b32_e32 v8, 0
1845; GFX1064-NEXT:    s_branch .LBB33_2
1846; GFX1064-NEXT:  .LBB33_1: ; %body
1847; GFX1064-NEXT:    ; in Loop: Header=BB33_2 Depth=1
1848; GFX1064-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1849; GFX1064-NEXT:    v_add_f32_e32 v8, 2.0, v8
1850; GFX1064-NEXT:    s_cbranch_execz .LBB33_4
1851; GFX1064-NEXT:  .LBB33_2: ; %loop
1852; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
1853; GFX1064-NEXT:    v_cmp_lt_f32_e32 vcc, 0x40e00000, v8
1854; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1855; GFX1064-NEXT:    v_mov_b32_e32 v0, v4
1856; GFX1064-NEXT:    v_mov_b32_e32 v1, v5
1857; GFX1064-NEXT:    v_mov_b32_e32 v2, v6
1858; GFX1064-NEXT:    v_mov_b32_e32 v3, v7
1859; GFX1064-NEXT:    s_cbranch_vccz .LBB33_1
1860; GFX1064-NEXT:  ; %bb.3:
1861; GFX1064-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1862; GFX1064-NEXT:    ; implicit-def: $vgpr8
1863; GFX1064-NEXT:  .LBB33_4: ; %break
1864; GFX1064-NEXT:    s_and_b64 exec, exec, s[0:1]
1865; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1866; GFX1064-NEXT:    ; return to shader part epilog
1867entry:
1868  br label %loop
1869
1870loop:
1871  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
1872  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
1873  %cc = fcmp ogt float %ctr.iv, 7.0
1874  br i1 %cc, label %break, label %body
1875
1876body:
1877  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
1878  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
1879  %ctr.next = fadd float %ctr.iv, 2.0
1880  br label %loop
1881
1882break:
1883  ret <4 x float> %c.iv
1884}
1885
; fadd of the two VGPR arguments whose result is passed through
; llvm.amdgcn.wwm.f32 and returned. %idx0 and %idx1 are unused.
; Expected code brackets the copies and the add between s_or_saveexec ..., -1
; and an s_mov that restores exec, then copies the result into v0. The saved
; mask is s0 on wave32 and s[0:1] on wave64.
1886define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
1887; GFX1032-LABEL: test_wwm1:
1888; GFX1032:       ; %bb.0: ; %main_body
1889; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
1890; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1891; GFX1032-NEXT:    v_mov_b32_e32 v3, v0
1892; GFX1032-NEXT:    v_add_f32_e32 v2, v3, v2
1893; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
1894; GFX1032-NEXT:    v_mov_b32_e32 v0, v2
1895; GFX1032-NEXT:    ; return to shader part epilog
1896;
1897; GFX1064-LABEL: test_wwm1:
1898; GFX1064:       ; %bb.0: ; %main_body
1899; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1900; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1901; GFX1064-NEXT:    v_mov_b32_e32 v3, v0
1902; GFX1064-NEXT:    v_add_f32_e32 v2, v3, v2
1903; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1904; GFX1064-NEXT:    v_mov_b32_e32 v0, v2
1905; GFX1064-NEXT:    ; return to shader part epilog
1906main_body:
1907  %out = fadd float %src0, %src1
1908  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
1909  ret float %out.0
1910}
1911
; llvm.amdgcn.wwm.f32 used inside a divergent branch: lanes whose mbcnt value
; is below 16 execute %if, the others go straight to %endif with 0.0.
; Expected code nests the s_or_saveexec ..., -1 region (buffer load and the
; first add) inside the s_and_saveexec region of the branch. The inner mask is
; restored before the second add and the outer one at %endif with s_or.
; wave32 keeps the masks in s1 / s2; wave64 in s[2:3] / s[4:5].
1912define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
1913; GFX1032-LABEL: test_wwm2:
1914; GFX1032:       ; %bb.0: ; %main_body
1915; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1916; GFX1032-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1917; GFX1032-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
1918; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
1919; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
1920; GFX1032-NEXT:    s_cbranch_execz .LBB35_2
1921; GFX1032-NEXT:  ; %bb.1: ; %if
1922; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1923; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
1924; GFX1032-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1925; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1926; GFX1032-NEXT:    v_add_f32_e32 v2, v1, v1
1927; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1928; GFX1032-NEXT:    v_mov_b32_e32 v0, v2
1929; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v0
1930; GFX1032-NEXT:  .LBB35_2: ; %endif
1931; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1932; GFX1032-NEXT:    ; return to shader part epilog
1933;
1934; GFX1064-LABEL: test_wwm2:
1935; GFX1064:       ; %bb.0: ; %main_body
1936; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1937; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1938; GFX1064-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
1939; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
1940; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1941; GFX1064-NEXT:    s_cbranch_execz .LBB35_2
1942; GFX1064-NEXT:  ; %bb.1: ; %if
1943; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1944; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
1945; GFX1064-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1946; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1947; GFX1064-NEXT:    v_add_f32_e32 v2, v1, v1
1948; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1949; GFX1064-NEXT:    v_mov_b32_e32 v0, v2
1950; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v0
1951; GFX1064-NEXT:  .LBB35_2: ; %endif
1952; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
1953; GFX1064-NEXT:    ; return to shader part epilog
1954main_body:
1955  ; use mbcnt to make sure the branch is divergent
1956  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1957  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1958  %cc = icmp uge i32 %hi, 16
1959  br i1 %cc, label %endif, label %if
1960
1961if:
1962  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
1963  %out = fadd float %src, %src
1964  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
1965  %out.1 = fadd float %src, %out.0
1966  br label %endif
1967
1968endif:
1969  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
1970  ret float %out.2
1971}
1972
; Same shape as test_wwm1 but using llvm.amdgcn.strict.wwm.f32: an fadd of the
; two VGPR arguments passed through the intrinsic and returned. %idx0 and
; %idx1 are unused.
; Expected code brackets the copies and the add between s_or_saveexec ..., -1
; and an s_mov that restores exec, then copies the result into v0. The saved
; mask is s0 on wave32 and s[0:1] on wave64.
1973define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
1974; GFX1032-LABEL: test_strict_wwm1:
1975; GFX1032:       ; %bb.0: ; %main_body
1976; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
1977; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1978; GFX1032-NEXT:    v_mov_b32_e32 v3, v0
1979; GFX1032-NEXT:    v_add_f32_e32 v2, v3, v2
1980; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
1981; GFX1032-NEXT:    v_mov_b32_e32 v0, v2
1982; GFX1032-NEXT:    ; return to shader part epilog
1983;
1984; GFX1064-LABEL: test_strict_wwm1:
1985; GFX1064:       ; %bb.0: ; %main_body
1986; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
1987; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1988; GFX1064-NEXT:    v_mov_b32_e32 v3, v0
1989; GFX1064-NEXT:    v_add_f32_e32 v2, v3, v2
1990; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
1991; GFX1064-NEXT:    v_mov_b32_e32 v0, v2
1992; GFX1064-NEXT:    ; return to shader part epilog
1993main_body:
1994  %out = fadd float %src0, %src1
1995  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
1996  ret float %out.0
1997}
1998
; llvm.amdgcn.strict.wwm.f32 used inside a divergent branch: lanes whose mbcnt
; value is below 16 execute %if, the others go straight to %endif with 0.0.
; Expected code nests the s_or_saveexec ..., -1 region (buffer load and the
; first add) inside the s_and_saveexec region of the branch. The inner mask is
; restored before the second add and the outer one at %endif with s_or.
; wave32 keeps the masks in s1 / s2; wave64 in s[2:3] / s[4:5].
1999define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) {
2000; GFX1032-LABEL: test_strict_wwm2:
2001; GFX1032:       ; %bb.0: ; %main_body
2002; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2003; GFX1032-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2004; GFX1032-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2005; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2006; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2007; GFX1032-NEXT:    s_cbranch_execz .LBB37_2
2008; GFX1032-NEXT:  ; %bb.1: ; %if
2009; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2010; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
2011; GFX1032-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2012; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2013; GFX1032-NEXT:    v_add_f32_e32 v2, v1, v1
2014; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2015; GFX1032-NEXT:    v_mov_b32_e32 v0, v2
2016; GFX1032-NEXT:    v_add_f32_e32 v0, v1, v0
2017; GFX1032-NEXT:  .LBB37_2: ; %endif
2018; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2019; GFX1032-NEXT:    ; return to shader part epilog
2020;
2021; GFX1064-LABEL: test_strict_wwm2:
2022; GFX1064:       ; %bb.0: ; %main_body
2023; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2024; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2025; GFX1064-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2026; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2027; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2028; GFX1064-NEXT:    s_cbranch_execz .LBB37_2
2029; GFX1064-NEXT:  ; %bb.1: ; %if
2030; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2031; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
2032; GFX1064-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2033; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2034; GFX1064-NEXT:    v_add_f32_e32 v2, v1, v1
2035; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2036; GFX1064-NEXT:    v_mov_b32_e32 v0, v2
2037; GFX1064-NEXT:    v_add_f32_e32 v0, v1, v0
2038; GFX1064-NEXT:  .LBB37_2: ; %endif
2039; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
2040; GFX1064-NEXT:    ; return to shader part epilog
2041main_body:
2042  ; use mbcnt to make sure the branch is divergent
2043  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2044  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2045  %cc = icmp uge i32 %hi, 16
2046  br i1 %cc, label %endif, label %if
2047
2048if:
2049  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
2050  %out = fadd float %src, %src
2051  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2052  %out.1 = fadd float %src, %out.0
2053  br label %endif
2054
2055endif:
2056  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2057  ret float %out.2
2058}
2058}
2059
2060
; Two attributes are interpolated with llvm.amdgcn.interp.p1 / interp.p2
; (using %m0) and used as the coordinates of a 2D image sample, whose result
; is returned.
; Expected code saves exec, applies s_wqm to exec, copies s3 into m0, runs the
; four v_interp instructions, ANDs exec with the saved mask and only then
; issues image_sample. The saved mask is s0 on wave32 and s[0:1] on wave64.
2061define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 {
2062; GFX1032-LABEL: test_wqm1:
2063; GFX1032:       ; %bb.0: ; %main_body
2064; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
2065; GFX1032-NEXT:    s_wqm_b32 exec_lo, exec_lo
2066; GFX1032-NEXT:    s_mov_b32 m0, s3
2067; GFX1032-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
2068; GFX1032-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
2069; GFX1032-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
2070; GFX1032-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
2071; GFX1032-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2072; GFX1032-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
2073; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2074; GFX1032-NEXT:    ; return to shader part epilog
2075;
2076; GFX1064-LABEL: test_wqm1:
2077; GFX1064:       ; %bb.0: ; %main_body
2078; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
2079; GFX1064-NEXT:    s_wqm_b64 exec, exec
2080; GFX1064-NEXT:    s_mov_b32 m0, s3
2081; GFX1064-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
2082; GFX1064-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
2083; GFX1064-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
2084; GFX1064-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
2085; GFX1064-NEXT:    s_and_b64 exec, exec, s[0:1]
2086; GFX1064-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
2087; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2088; GFX1064-NEXT:    ; return to shader part epilog
2089main_body:
2090  %inst23 = extractelement <2 x float> %pos, i32 0
2091  %inst24 = extractelement <2 x float> %pos, i32 1
2092  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
2093  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
2094  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
2095  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
2096  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
2097  ret <4 x float> %tex
2098}
2099
; Two struct buffer loads indexed by the SGPR arguments are added, and the sum
; is passed through llvm.amdgcn.wqm.i32 (bitcast to i32 and back) before being
; returned.
; Expected code saves exec and applies s_wqm before the loads and the add,
; then ANDs exec with the saved mask at the end. The saved mask is s2 on
; wave32 and s[2:3] on wave64.
2100define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 {
2101; GFX1032-LABEL: test_wqm2:
2102; GFX1032:       ; %bb.0: ; %main_body
2103; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
2104; GFX1032-NEXT:    s_wqm_b32 exec_lo, exec_lo
2105; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
2106; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
2107; GFX1032-NEXT:    s_clause 0x1
2108; GFX1032-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2109; GFX1032-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 idxen
2110; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2111; GFX1032-NEXT:    v_add_f32_e32 v0, v2, v3
2112; GFX1032-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2113; GFX1032-NEXT:    s_and_b32 exec_lo, exec_lo, s2
2114; GFX1032-NEXT:    ; return to shader part epilog
2115;
2116; GFX1064-LABEL: test_wqm2:
2117; GFX1064:       ; %bb.0: ; %main_body
2118; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
2119; GFX1064-NEXT:    s_wqm_b64 exec, exec
2120; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
2121; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
2122; GFX1064-NEXT:    s_clause 0x1
2123; GFX1064-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2124; GFX1064-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 idxen
2125; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2126; GFX1064-NEXT:    v_add_f32_e32 v0, v2, v3
2127; GFX1064-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
2128; GFX1064-NEXT:    s_and_b64 exec, exec, s[2:3]
2129; GFX1064-NEXT:    ; return to shader part epilog
2130main_body:
2131  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
2132  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
2133  %out = fadd float %src0, %src1
2134  %out.0 = bitcast float %out to i32
2135  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
2136  %out.2 = bitcast i32 %out.1 to float
2137  ret float %out.2
2138}
2139
; llvm.amdgcn.fcmp.i64.f32 comparing %src with fabs(%a) using condition code
; 1; the i64 result is stored to %out. The checks show the compare selected as
; v_cmp_eq_f32 with the fabs folded into the |s3| source modifier.
; On wave32 the compare writes a single SGPR (s2); the stored upper dword is
; v1, which already holds 0. On wave64 it writes the pair s[2:3] and both
; halves are copied to VGPRs for the store.
2140define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
2141; GFX1032-LABEL: test_intr_fcmp_i64:
2142; GFX1032:       ; %bb.0:
2143; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2144; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2145; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2146; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
2147; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
2148; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
2149; GFX1032-NEXT:    s_endpgm
2150;
2151; GFX1064-LABEL: test_intr_fcmp_i64:
2152; GFX1064:       ; %bb.0:
2153; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2154; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2155; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2156; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
2157; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
2158; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
2159; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2160; GFX1064-NEXT:    s_endpgm
2161  %temp = call float @llvm.fabs.f32(float %a)
2162  %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
2163  store i64 %result, ptr addrspace(1) %out
2164  ret void
2165}
2166
; llvm.amdgcn.icmp.i64.i32 comparing %src with 100 (0x64 in the checks) using
; condition code 32; the i64 result is stored to %out. The checks show the
; compare selected as v_cmp_eq_u32.
; On wave32 the compare writes a single SGPR (s2); the stored upper dword is
; v1, which already holds 0. On wave64 it writes the pair s[2:3] and both
; halves are copied to VGPRs for the store.
2167define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
2168; GFX1032-LABEL: test_intr_icmp_i64:
2169; GFX1032:       ; %bb.0:
2170; GFX1032-NEXT:    s_clause 0x1
2171; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
2172; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2173; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2174; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2175; GFX1032-NEXT:    v_cmp_eq_u32_e64 s2, 0x64, s2
2176; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
2177; GFX1032-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
2178; GFX1032-NEXT:    s_endpgm
2179;
2180; GFX1064-LABEL: test_intr_icmp_i64:
2181; GFX1064:       ; %bb.0:
2182; GFX1064-NEXT:    s_clause 0x1
2183; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x2c
2184; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2185; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2186; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2187; GFX1064-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0x64, s2
2188; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
2189; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
2190; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2191; GFX1064-NEXT:    s_endpgm
2192  %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
2193  store i64 %result, ptr addrspace(1) %out
2194  ret void
2195}
2196
; i32-result counterpart of test_intr_fcmp_i64: stores the result of
; llvm.amdgcn.fcmp.i32.f32(%src, fabs(%a), 1) to %out as a single dword.
; Both prefixes expect v_cmp_eq_f32_e64 with the |s3| modifier. Under the
; GFX1064 prefix (wave64) the compare still writes the pair s[2:3], but only
; s2 (the low half) is copied to v1 and stored.
2197define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
2198; GFX1032-LABEL: test_intr_fcmp_i32:
2199; GFX1032:       ; %bb.0:
2200; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2201; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2202; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2203; GFX1032-NEXT:    v_cmp_eq_f32_e64 s2, s2, |s3|
2204; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
2205; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
2206; GFX1032-NEXT:    s_endpgm
2207;
2208; GFX1064-LABEL: test_intr_fcmp_i32:
2209; GFX1064:       ; %bb.0:
2210; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2211; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2212; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX1064-NEXT:    v_cmp_eq_f32_e64 s[2:3], s2, |s3|
2214; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
2215; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
2216; GFX1064-NEXT:    s_endpgm
; The fabs result is used only as the second compare operand.
2217  %temp = call float @llvm.fabs.f32(float %a)
2218  %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
2219  store i32 %result, ptr addrspace(1) %out
2220  ret void
2221}
2222
; i32-result counterpart of test_intr_icmp_i64: stores the result of
; llvm.amdgcn.icmp.i32.i32(%src, 100, 32) to %out as a single dword.
; Both prefixes expect v_cmp_eq_u32_e64 against the literal 0x64. Under the
; GFX1064 prefix (wave64) the compare writes s[2:3], but only s2 (the low
; half) is copied to v1 and stored.
2223define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) {
2224; GFX1032-LABEL: test_intr_icmp_i32:
2225; GFX1032:       ; %bb.0:
2226; GFX1032-NEXT:    s_clause 0x1
2227; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
2228; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2229; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2230; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2231; GFX1032-NEXT:    v_cmp_eq_u32_e64 s2, 0x64, s2
2232; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
2233; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
2234; GFX1032-NEXT:    s_endpgm
2235;
2236; GFX1064-LABEL: test_intr_icmp_i32:
2237; GFX1064:       ; %bb.0:
2238; GFX1064-NEXT:    s_clause 0x1
2239; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x2c
2240; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2241; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2242; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX1064-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0x64, s2
2244; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
2245; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
2246; GFX1064-NEXT:    s_endpgm
2247  %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
2248  store i32 %result, ptr addrspace(1) %out
2249  ret void
2250}
2251
; Pixel shader that feeds (%a une 0.0) through llvm.amdgcn.wqm.vote into
; llvm.amdgcn.kill, then performs an export.
; The checks expect the vote to select as s_wqm_b32 on vcc_lo under the
; GFX1032 prefix (wave32) and s_wqm_b64 on vcc under the GFX1064 prefix
; (wave64), followed by s_andn2 updates of a saved copy of exec and an
; s_cbranch_scc0 to a block that zeroes exec and issues a null export with
; "done".
2252define amdgpu_ps void @test_wqm_vote(float %a) {
2253; GFX1032-LABEL: test_wqm_vote:
2254; GFX1032:       ; %bb.0:
2255; GFX1032-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
2256; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
2257; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
2258; GFX1032-NEXT:    s_wqm_b32 s1, vcc_lo
2259; GFX1032-NEXT:    s_andn2_b32 s1, exec_lo, s1
2260; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
2261; GFX1032-NEXT:    s_cbranch_scc0 .LBB44_2
2262; GFX1032-NEXT:  ; %bb.1:
2263; GFX1032-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2264; GFX1032-NEXT:    exp mrt0 off, off, off, off
2265; GFX1032-NEXT:    s_endpgm
2266; GFX1032-NEXT:  .LBB44_2:
2267; GFX1032-NEXT:    s_mov_b32 exec_lo, 0
2268; GFX1032-NEXT:    exp null off, off, off, off done vm
2269; GFX1032-NEXT:    s_endpgm
2270;
2271; GFX1064-LABEL: test_wqm_vote:
2272; GFX1064:       ; %bb.0:
2273; GFX1064-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
2274; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
2275; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
2276; GFX1064-NEXT:    s_wqm_b64 s[2:3], vcc
2277; GFX1064-NEXT:    s_andn2_b64 s[2:3], exec, s[2:3]
2278; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
2279; GFX1064-NEXT:    s_cbranch_scc0 .LBB44_2
2280; GFX1064-NEXT:  ; %bb.1:
2281; GFX1064-NEXT:    s_and_b64 exec, exec, s[0:1]
2282; GFX1064-NEXT:    exp mrt0 off, off, off, off
2283; GFX1064-NEXT:    s_endpgm
2284; GFX1064-NEXT:  .LBB44_2:
2285; GFX1064-NEXT:    s_mov_b64 exec, 0
2286; GFX1064-NEXT:    exp null off, off, off, off done vm
2287; GFX1064-NEXT:    s_endpgm
2288  %c1 = fcmp une float %a, 0.0
2289  %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
; The vote result is the kill condition.
2290  call void @llvm.amdgcn.kill(i1 %c2)
; Export with an enable mask of 0 and both trailing i1 flags false; the checks
; show it as "exp mrt0 off, off, off, off".
2291  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
2292  ret void
2293}
2294
; Kernel whose entry block branches on the constant true. The function uses
; attribute group #2, which is expected to carry optnone/noinline (confirm
; against the attribute definitions at the end of the file).
; The checks show the branch is kept: vcc is loaded from exec (s_mov_b32
; vcc_lo, exec_lo under GFX1032 / s_mov_b64 vcc, exec under GFX1064) and an
; s_cbranch_execnz follows, and the loop blocks remain in the output.
2295define amdgpu_kernel void @test_branch_true() #2 {
2296; GFX1032-LABEL: test_branch_true:
2297; GFX1032:       ; %bb.0: ; %entry
2298; GFX1032-NEXT:    s_mov_b32 vcc_lo, exec_lo
2299; GFX1032-NEXT:    s_cbranch_execnz .LBB45_2
2300; GFX1032-NEXT:  ; %bb.1: ; %for.body.lr.ph
2301; GFX1032-NEXT:    s_branch .LBB45_3
2302; GFX1032-NEXT:  .LBB45_2: ; %Flow
2303; GFX1032-NEXT:    s_branch .LBB45_5
2304; GFX1032-NEXT:  .LBB45_3: ; %for.body
2305; GFX1032-NEXT:    s_mov_b32 vcc_lo, 0
2306; GFX1032-NEXT:  ; %bb.4: ; %for.end.loopexit
2307; GFX1032-NEXT:    s_branch .LBB45_2
2308; GFX1032-NEXT:  .LBB45_5: ; %for.end
2309; GFX1032-NEXT:    s_endpgm
2310;
2311; GFX1064-LABEL: test_branch_true:
2312; GFX1064:       ; %bb.0: ; %entry
2313; GFX1064-NEXT:    s_mov_b64 vcc, exec
2314; GFX1064-NEXT:    s_cbranch_execnz .LBB45_2
2315; GFX1064-NEXT:  ; %bb.1: ; %for.body.lr.ph
2316; GFX1064-NEXT:    s_branch .LBB45_3
2317; GFX1064-NEXT:  .LBB45_2: ; %Flow
2318; GFX1064-NEXT:    s_branch .LBB45_5
2319; GFX1064-NEXT:  .LBB45_3: ; %for.body
2320; GFX1064-NEXT:    s_mov_b64 vcc, 0
2321; GFX1064-NEXT:  ; %bb.4: ; %for.end.loopexit
2322; GFX1064-NEXT:    s_branch .LBB45_2
2323; GFX1064-NEXT:  .LBB45_5: ; %for.end
2324; GFX1064-NEXT:    s_endpgm
2325entry:
; Constant-true condition: control always goes to %for.end.
2326  br i1 true, label %for.end, label %for.body.lr.ph
2327
2328for.body.lr.ph:                                   ; preds = %entry
2329  br label %for.body
2330
2331for.body:                                         ; preds = %for.body, %for.body.lr.ph
; The loop exit condition is undef; the checks above expect no backedge
; branch to be emitted for this block.
2332  br i1 undef, label %for.end, label %for.body
2333
2334for.end:                                          ; preds = %for.body, %entry
2335  ret void
2336}
2337
; Returns llvm.amdgcn.ps.live() zero-extended to i32 and bitcast to float.
; The checks expect exec to be copied to SGPRs (s_mov_b32 s0, exec_lo under
; GFX1032; s_mov_b64 s[0:1], exec under GFX1064) and used as the select mask
; of a v_cndmask_b32 choosing between 0 and 1.
2338define amdgpu_ps float @test_ps_live() #0 {
2339; GFX1032-LABEL: test_ps_live:
2340; GFX1032:       ; %bb.0:
2341; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
2342; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
2343; GFX1032-NEXT:    ; return to shader part epilog
2344;
2345; GFX1064-LABEL: test_ps_live:
2346; GFX1064:       ; %bb.0:
2347; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
2348; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
2349; GFX1064-NEXT:    ; return to shader part epilog
2350  %live = call i1 @llvm.amdgcn.ps.live()
2351  %live.32 = zext i1 %live to i32
; Bitcast (not a numeric conversion): the integer 0/1 bit pattern is returned
; in the float return register.
2352  %r = bitcast i32 %live.32 to float
2353  ret float %r
2354}
2355
; Triangle-shaped control flow on a double: loads %v from %in, and if
; %v == 1.0 (ordered) replaces it with %v + %v before storing to %out.
; The checks expect a real branch rather than a select: the inverted compare
; v_cmp_neq_f64 is ANDed with exec into vcc and consumed by s_cbranch_vccnz.
; The same check prefixes are shared by the command lines at the top of the
; file that pass -amdgpu-early-ifcvt=1 and those that do not, so the expected
; output is identical in both configurations.
2356define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2357; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
2358; GFX1032:       ; %bb.0: ; %entry
2359; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2360; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2362; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2363; GFX1032-NEXT:    v_cmp_neq_f64_e64 s4, s[2:3], 1.0
2364; GFX1032-NEXT:    s_and_b32 vcc_lo, exec_lo, s4
2365; GFX1032-NEXT:    s_cbranch_vccnz .LBB47_2
2366; GFX1032-NEXT:  ; %bb.1: ; %if
2367; GFX1032-NEXT:    v_add_f64 v[0:1], s[2:3], s[2:3]
2368; GFX1032-NEXT:    s_branch .LBB47_3
2369; GFX1032-NEXT:  .LBB47_2:
2370; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
2371; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
2372; GFX1032-NEXT:  .LBB47_3: ; %endif
2373; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
2374; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2375; GFX1032-NEXT:    s_endpgm
2376;
2377; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
2378; GFX1064:       ; %bb.0: ; %entry
2379; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2380; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2381; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2382; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2383; GFX1064-NEXT:    v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
2384; GFX1064-NEXT:    s_and_b64 vcc, exec, s[4:5]
2385; GFX1064-NEXT:    s_cbranch_vccnz .LBB47_2
2386; GFX1064-NEXT:  ; %bb.1: ; %if
2387; GFX1064-NEXT:    v_add_f64 v[0:1], s[2:3], s[2:3]
2388; GFX1064-NEXT:    s_branch .LBB47_3
2389; GFX1064-NEXT:  .LBB47_2:
2390; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
2391; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
2392; GFX1064-NEXT:  .LBB47_3: ; %endif
2393; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
2394; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2395; GFX1064-NEXT:    s_endpgm
2396entry:
2397  %v = load double, ptr addrspace(1) %in
2398  %cc = fcmp oeq double %v, 1.000000e+00
2399  br i1 %cc, label %if, label %endif
2400
2401if:
; The only work on the conditional side of the triangle.
2402  %u = fadd double %v, %v
2403  br label %endif
2404
2405endif:
; Merge: the unmodified value when coming from %entry, the doubled one from %if.
2406  %r = phi double [ %v, %entry ], [ %u, %if ]
2407  store double %r, ptr addrspace(1) %out
2408  ret void
2409}
2410
; Geometry shader that returns the left-to-right sum of its twelve float
; arguments (%a through %l). It uses attribute group #3, which is expected to
; select wavefrontsize32 via "target-features" (confirm against the attribute
; definitions at the end of the file).
; The checks use the common GCN prefix, so every command line at the top of
; the file must produce the same chain of eleven v_add_f32 instructions.
; NOTE(review): the name suggests this was meant to cover VGPR block
; accounting; no check of that is visible in the assertions below - confirm.
2411define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e,
2412; GCN-LABEL: test_vgprblocks_w32_attr:
2413; GCN:       ; %bb.0: ; %main_body
2414; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
2415; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
2416; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
2417; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
2418; GCN-NEXT:    v_add_f32_e32 v0, v0, v5
2419; GCN-NEXT:    v_add_f32_e32 v0, v0, v6
2420; GCN-NEXT:    v_add_f32_e32 v0, v0, v7
2421; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
2422; GCN-NEXT:    v_add_f32_e32 v0, v0, v9
2423; GCN-NEXT:    v_add_f32_e32 v0, v0, v10
2424; GCN-NEXT:    v_add_f32_e32 v0, v0, v11
2425; GCN-NEXT:    ; return to shader part epilog
2426                                        float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 {
2427main_body:
; Serial accumulation: each fadd consumes the previous partial sum.
2428  %s = fadd float %a, %b
2429  %s.1 = fadd float %s, %c
2430  %s.2 = fadd float %s.1, %d
2431  %s.3 = fadd float %s.2, %e
2432  %s.4 = fadd float %s.3, %f
2433  %s.5 = fadd float %s.4, %g
2434  %s.6 = fadd float %s.5, %h
2435  %s.7 = fadd float %s.6, %i
2436  %s.8 = fadd float %s.7, %j
2437  %s.9 = fadd float %s.8, %k
2438  %s.10 = fadd float %s.9, %l
2439  ret float %s.10
2440}
2441
; Same body as test_vgprblocks_w32_attr (sum of twelve float arguments), but
; using attribute group #4, which is expected to select wavefrontsize64 via
; "target-features" (confirm against the attribute definitions at the end of
; the file).
; The checks use the common GCN prefix, so every command line at the top of
; the file must produce the same chain of eleven v_add_f32 instructions.
; NOTE(review): the name suggests this was meant to cover VGPR block
; accounting; no check of that is visible in the assertions below - confirm.
2442define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e,
2443; GCN-LABEL: test_vgprblocks_w64_attr:
2444; GCN:       ; %bb.0: ; %main_body
2445; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
2446; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
2447; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
2448; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
2449; GCN-NEXT:    v_add_f32_e32 v0, v0, v5
2450; GCN-NEXT:    v_add_f32_e32 v0, v0, v6
2451; GCN-NEXT:    v_add_f32_e32 v0, v0, v7
2452; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
2453; GCN-NEXT:    v_add_f32_e32 v0, v0, v9
2454; GCN-NEXT:    v_add_f32_e32 v0, v0, v10
2455; GCN-NEXT:    v_add_f32_e32 v0, v0, v11
2456; GCN-NEXT:    ; return to shader part epilog
2457                                        float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 {
2458main_body:
; Serial accumulation: each fadd consumes the previous partial sum.
2459  %s = fadd float %a, %b
2460  %s.1 = fadd float %s, %c
2461  %s.2 = fadd float %s.1, %d
2462  %s.3 = fadd float %s.2, %e
2463  %s.4 = fadd float %s.3, %f
2464  %s.5 = fadd float %s.4, %g
2465  %s.6 = fadd float %s.5, %h
2466  %s.7 = fadd float %s.6, %i
2467  %s.8 = fadd float %s.7, %j
2468  %s.9 = fadd float %s.8, %k
2469  %s.10 = fadd float %s.9, %l
2470  ret float %s.10
2471}
2472
; Uses the i64 result of llvm.amdgcn.icmp.i64.i32 as an integer: it is shifted
; right by one, ORed with bit 63, passed to cttz, and the count is compared
; against 10 to decide whether to reach an unreachable block.
; The checks show the urem expanded inline and the 64-bit bit manipulation
; done on SGPRs. Under the GFX1032 prefix (wave32) only vcc_lo is shifted
; (s_lshr_b32) and the high dword is materialized as s_brev_b32 s1, 1 before
; s_ff1_i32_b64; under the GFX1064 prefix (wave64) the full vcc is shifted
; (s_lshr_b64) and bit 31 of the high dword is set with s_bitset1_b32.
2473define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
2474; GFX1032-LABEL: icmp64:
2475; GFX1032:       ; %bb.0: ; %entry
2476; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x28
2477; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2478; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s0
2479; GFX1032-NEXT:    s_sub_i32 s1, 0, s0
2480; GFX1032-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2481; GFX1032-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2482; GFX1032-NEXT:    v_cvt_u32_f32_e32 v1, v1
2483; GFX1032-NEXT:    v_mul_lo_u32 v2, s1, v1
2484; GFX1032-NEXT:    s_brev_b32 s1, 1
2485; GFX1032-NEXT:    v_mul_hi_u32 v2, v1, v2
2486; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2487; GFX1032-NEXT:    v_mul_hi_u32 v1, v0, v1
2488; GFX1032-NEXT:    v_mul_lo_u32 v1, v1, s0
2489; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
2490; GFX1032-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2491; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s0, v0
2492; GFX1032-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
2493; GFX1032-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2494; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s0, v0
2495; GFX1032-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
2496; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2497; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
2498; GFX1032-NEXT:    s_ff1_i32_b64 s0, s[0:1]
2499; GFX1032-NEXT:    s_cmp_gt_u32 s0, 9
2500; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
2501; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
2502; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
2503; GFX1032-NEXT:  ; %bb.1: ; %if.then
2504; GFX1032-NEXT:    ; divergent unreachable
2505; GFX1032-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2506; GFX1032-NEXT:    s_endpgm
2507;
2508; GFX1064-LABEL: icmp64:
2509; GFX1064:       ; %bb.0: ; %entry
2510; GFX1064-NEXT:    s_load_dword s0, s[4:5], 0x28
2511; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s0
2513; GFX1064-NEXT:    s_sub_i32 s1, 0, s0
2514; GFX1064-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2515; GFX1064-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2516; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
2517; GFX1064-NEXT:    v_mul_lo_u32 v2, s1, v1
2518; GFX1064-NEXT:    v_mul_hi_u32 v2, v1, v2
2519; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2520; GFX1064-NEXT:    v_mul_hi_u32 v1, v0, v1
2521; GFX1064-NEXT:    v_mul_lo_u32 v1, v1, s0
2522; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
2523; GFX1064-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2524; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
2525; GFX1064-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2526; GFX1064-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2527; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
2528; GFX1064-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2529; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2530; GFX1064-NEXT:    s_lshr_b64 s[0:1], vcc, 1
2531; GFX1064-NEXT:    s_bitset1_b32 s1, 31
2532; GFX1064-NEXT:    s_ff1_i32_b64 s0, s[0:1]
2533; GFX1064-NEXT:    s_cmp_gt_u32 s0, 9
2534; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
2535; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
2536; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
2537; GFX1064-NEXT:  ; %bb.1: ; %if.then
2538; GFX1064-NEXT:    ; divergent unreachable
2539; GFX1064-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2540; GFX1064-NEXT:    s_endpgm
2541entry:
2542  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
; %mul4 and %cmp have no users in this function.
2543  %mul4 = mul nsw i32 %s, %n
2544  %cmp = icmp slt i32 0, %mul4
2545  br label %if.end
2546
2547if.end:                                           ; preds = %entry
2548  %rem = urem i32 %id, %s
2549  %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32)
2550  %shr = lshr i64 %icmp, 1
; Shifting -1 left by 0 leaves all ones, so the following 'and' does not
; change %shr.
2551  %notmask = shl nsw i64 -1, 0
2552  %and = and i64 %notmask, %shr
; -9223372036854775808 is bit 63 only, so the cttz input is never zero.
2553  %or = or i64 %and, -9223372036854775808
2554  %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
2555  %cast = trunc i64 %cttz to i32
2556  %cmp3 = icmp ugt i32 10, %cast
2557  %cmp6 = icmp ne i32 %rem, 0
2558  %brmerge = or i1 %cmp6, %cmp3
2559  br i1 %brmerge, label %if.end2, label %if.then
2560
2561if.then:                                          ; preds = %if.end
2562  unreachable
2563
2564if.end2:                                          ; preds = %if.end
2565  ret void
2566}
2567
; Floating-point counterpart of icmp64: the i64 result of
; llvm.amdgcn.fcmp.i64.f32(frem(id, %s), 0.0, 1) is shifted right by one, ORed
; with bit 63, passed to cttz, and the count is compared against 10 to decide
; whether to reach an unreachable block.
; The checks show frem expanded inline (v_div_scale / v_rcp / v_div_fmas /
; v_div_fixup, then v_trunc and v_fma). Under the GFX1032 prefix (wave32) only
; vcc_lo is shifted and the high dword comes from s_brev_b32 s1, 1; under the
; GFX1064 prefix (wave64) the full vcc is shifted and s_bitset1_b32 s1, 31
; sets the top bit.
2568define amdgpu_kernel void @fcmp64(float %n, float %s) {
2569; GFX1032-LABEL: fcmp64:
2570; GFX1032:       ; %bb.0: ; %entry
2571; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x28
2572; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, v0
2573; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2574; GFX1032-NEXT:    v_div_scale_f32 v1, s1, s0, s0, v0
2575; GFX1032-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, s0, v0
2576; GFX1032-NEXT:    s_brev_b32 s1, 1
2577; GFX1032-NEXT:    v_rcp_f32_e32 v2, v1
2578; GFX1032-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
2579; GFX1032-NEXT:    v_fmac_f32_e32 v2, v3, v2
2580; GFX1032-NEXT:    v_mul_f32_e32 v3, v4, v2
2581; GFX1032-NEXT:    v_fma_f32 v5, -v1, v3, v4
2582; GFX1032-NEXT:    v_fmac_f32_e32 v3, v5, v2
2583; GFX1032-NEXT:    v_fma_f32 v1, -v1, v3, v4
2584; GFX1032-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
2585; GFX1032-NEXT:    v_div_fixup_f32 v1, v1, s0, v0
2586; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
2587; GFX1032-NEXT:    v_fma_f32 v0, -v1, s0, v0
2588; GFX1032-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
2589; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
2590; GFX1032-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 0, v0
2591; GFX1032-NEXT:    s_ff1_i32_b64 s0, s[0:1]
2592; GFX1032-NEXT:    s_cmp_gt_u32 s0, 9
2593; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
2594; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
2595; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
2596; GFX1032-NEXT:  ; %bb.1: ; %if.then
2597; GFX1032-NEXT:    ; divergent unreachable
2598; GFX1032-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2599; GFX1032-NEXT:    s_endpgm
2600;
2601; GFX1064-LABEL: fcmp64:
2602; GFX1064:       ; %bb.0: ; %entry
2603; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x28
2604; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, v0
2605; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2606; GFX1064-NEXT:    v_div_scale_f32 v1, s[0:1], s2, s2, v0
2607; GFX1064-NEXT:    v_rcp_f32_e32 v2, v1
2608; GFX1064-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
2609; GFX1064-NEXT:    v_fmac_f32_e32 v2, v3, v2
2610; GFX1064-NEXT:    v_div_scale_f32 v3, vcc, v0, s2, v0
2611; GFX1064-NEXT:    v_mul_f32_e32 v4, v3, v2
2612; GFX1064-NEXT:    v_fma_f32 v5, -v1, v4, v3
2613; GFX1064-NEXT:    v_fmac_f32_e32 v4, v5, v2
2614; GFX1064-NEXT:    v_fma_f32 v1, -v1, v4, v3
2615; GFX1064-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
2616; GFX1064-NEXT:    v_div_fixup_f32 v1, v1, s2, v0
2617; GFX1064-NEXT:    v_trunc_f32_e32 v1, v1
2618; GFX1064-NEXT:    v_fma_f32 v0, -v1, s2, v0
2619; GFX1064-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
2620; GFX1064-NEXT:    s_lshr_b64 s[0:1], vcc, 1
2621; GFX1064-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v0
2622; GFX1064-NEXT:    s_bitset1_b32 s1, 31
2623; GFX1064-NEXT:    s_ff1_i32_b64 s0, s[0:1]
2624; GFX1064-NEXT:    s_cmp_gt_u32 s0, 9
2625; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
2626; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
2627; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
2628; GFX1064-NEXT:  ; %bb.1: ; %if.then
2629; GFX1064-NEXT:    ; divergent unreachable
2630; GFX1064-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2631; GFX1064-NEXT:    s_endpgm
2632entry:
2633  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2634  %id.f = uitofp i32 %id to float
; %mul4 and %cmp have no users in this function.
2635  %mul4 = fmul float %s, %n
2636  %cmp = fcmp ult float 0.0, %mul4
2637  br label %if.end
2638
2639if.end:                                           ; preds = %entry
2640  %rem.f = frem float %id.f, %s
2641  %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1)
2642  %shr = lshr i64 %fcmp, 1
; Shifting -1 left by 0 leaves all ones, so the following 'and' does not
; change %shr.
2643  %notmask = shl nsw i64 -1, 0
2644  %and = and i64 %notmask, %shr
; -9223372036854775808 is bit 63 only, so the cttz input is never zero.
2645  %or = or i64 %and, -9223372036854775808
2646  %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
2647  %cast = trunc i64 %cttz to i32
2648  %cmp3 = icmp ugt i32 10, %cast
2649  %cmp6 = fcmp one float %rem.f, 0.0
2650  %brmerge = or i1 %cmp6, %cmp3
2651  br i1 %brmerge, label %if.end2, label %if.then
2652
2653if.then:                                          ; preds = %if.end
2654  unreachable
2655
2656if.end2:                                          ; preds = %if.end
2657  ret void
2658}
2659
; i32-result counterpart of icmp64: the result of llvm.amdgcn.icmp.i32.i32 is
; shifted right by one, ORed with bit 31, passed to the 32-bit cttz, and the
; count is compared against 10 to decide whether to reach an unreachable block.
; Both prefixes expect the same 32-bit scalar sequence on vcc_lo
; (s_lshr_b32, s_bitset1_b32 ..., 31, s_ff1_i32_b32); under the GFX1064 prefix
; (wave64) the compare itself writes the full vcc but only vcc_lo is consumed
; by the shift, while the final branch mask is still 64 bits wide.
2660define amdgpu_kernel void @icmp32(i32 %n, i32 %s) {
2661; GFX1032-LABEL: icmp32:
2662; GFX1032:       ; %bb.0: ; %entry
2663; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x28
2664; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2665; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s0
2666; GFX1032-NEXT:    s_sub_i32 s1, 0, s0
2667; GFX1032-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2668; GFX1032-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2669; GFX1032-NEXT:    v_cvt_u32_f32_e32 v1, v1
2670; GFX1032-NEXT:    v_mul_lo_u32 v2, s1, v1
2671; GFX1032-NEXT:    v_mul_hi_u32 v2, v1, v2
2672; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2673; GFX1032-NEXT:    v_mul_hi_u32 v1, v0, v1
2674; GFX1032-NEXT:    v_mul_lo_u32 v1, v1, s0
2675; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
2676; GFX1032-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2677; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s0, v0
2678; GFX1032-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
2679; GFX1032-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2680; GFX1032-NEXT:    v_cmp_le_u32_e32 vcc_lo, s0, v0
2681; GFX1032-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
2682; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2683; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
2684; GFX1032-NEXT:    s_bitset1_b32 s0, 31
2685; GFX1032-NEXT:    s_ff1_i32_b32 s0, s0
2686; GFX1032-NEXT:    s_cmp_gt_u32 s0, 9
2687; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
2688; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
2689; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
2690; GFX1032-NEXT:  ; %bb.1: ; %if.then
2691; GFX1032-NEXT:    ; divergent unreachable
2692; GFX1032-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2693; GFX1032-NEXT:    s_endpgm
2694;
2695; GFX1064-LABEL: icmp32:
2696; GFX1064:       ; %bb.0: ; %entry
2697; GFX1064-NEXT:    s_load_dword s0, s[4:5], 0x28
2698; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2699; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s0
2700; GFX1064-NEXT:    s_sub_i32 s1, 0, s0
2701; GFX1064-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2702; GFX1064-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2703; GFX1064-NEXT:    v_cvt_u32_f32_e32 v1, v1
2704; GFX1064-NEXT:    v_mul_lo_u32 v2, s1, v1
2705; GFX1064-NEXT:    v_mul_hi_u32 v2, v1, v2
2706; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2707; GFX1064-NEXT:    v_mul_hi_u32 v1, v0, v1
2708; GFX1064-NEXT:    v_mul_lo_u32 v1, v1, s0
2709; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
2710; GFX1064-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2711; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
2712; GFX1064-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2713; GFX1064-NEXT:    v_subrev_nc_u32_e32 v1, s0, v0
2714; GFX1064-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
2715; GFX1064-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2716; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2717; GFX1064-NEXT:    s_lshr_b32 s0, vcc_lo, 1
2718; GFX1064-NEXT:    s_bitset1_b32 s0, 31
2719; GFX1064-NEXT:    s_ff1_i32_b32 s0, s0
2720; GFX1064-NEXT:    s_cmp_gt_u32 s0, 9
2721; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
2722; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
2723; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
2724; GFX1064-NEXT:  ; %bb.1: ; %if.then
2725; GFX1064-NEXT:    ; divergent unreachable
2726; GFX1064-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2727; GFX1064-NEXT:    s_endpgm
2728entry:
2729  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
; %mul4 and %cmp have no users in this function.
2730  %mul4 = mul nsw i32 %s, %n
2731  %cmp = icmp slt i32 0, %mul4
2732  br label %if.end
2733
2734if.end:                                           ; preds = %entry
2735  %rem = urem i32 %id, %s
2736  %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32)
2737  %shr = lshr i32 %icmp, 1
; Shifting -1 left by 0 leaves all ones, so the following 'and' does not
; change %shr.
2738  %notmask = shl nsw i32 -1, 0
2739  %and = and i32 %notmask, %shr
; 2147483648 is bit 31 only, so the cttz input is never zero.
2740  %or = or i32 %and, 2147483648
2741  %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
2742  %cmp3 = icmp ugt i32 10, %cttz
2743  %cmp6 = icmp ne i32 %rem, 0
2744  %brmerge = or i1 %cmp6, %cmp3
2745  br i1 %brmerge, label %if.end2, label %if.then
2746
2747if.then:                                          ; preds = %if.end
2748  unreachable
2749
2750if.end2:                                          ; preds = %if.end
2751  ret void
2752}
2753
; i32-result counterpart of fcmp64: the result of
; llvm.amdgcn.fcmp.i32.f32(frem(id, %s), 0.0, 1) is shifted right by one, ORed
; with bit 31, passed to the 32-bit cttz, and the count is compared against 10
; to decide whether to reach an unreachable block.
; The checks show frem expanded inline (v_div_scale / v_rcp / v_div_fmas /
; v_div_fixup, then v_trunc and v_fma). Both prefixes expect the same 32-bit
; scalar sequence on vcc_lo (s_lshr_b32, s_bitset1_b32 ..., 31,
; s_ff1_i32_b32); under the GFX1064 prefix (wave64) the final branch mask is
; still 64 bits wide.
2754define amdgpu_kernel void @fcmp32(float %n, float %s) {
2755; GFX1032-LABEL: fcmp32:
2756; GFX1032:       ; %bb.0: ; %entry
2757; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x28
2758; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, v0
2759; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2760; GFX1032-NEXT:    v_div_scale_f32 v1, s1, s0, s0, v0
2761; GFX1032-NEXT:    v_rcp_f32_e32 v2, v1
2762; GFX1032-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
2763; GFX1032-NEXT:    v_fmac_f32_e32 v2, v3, v2
2764; GFX1032-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, s0, v0
2765; GFX1032-NEXT:    v_mul_f32_e32 v4, v3, v2
2766; GFX1032-NEXT:    v_fma_f32 v5, -v1, v4, v3
2767; GFX1032-NEXT:    v_fmac_f32_e32 v4, v5, v2
2768; GFX1032-NEXT:    v_fma_f32 v1, -v1, v4, v3
2769; GFX1032-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
2770; GFX1032-NEXT:    v_div_fixup_f32 v1, v1, s0, v0
2771; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
2772; GFX1032-NEXT:    v_fma_f32 v0, -v1, s0, v0
2773; GFX1032-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
2774; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
2775; GFX1032-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 0, v0
2776; GFX1032-NEXT:    s_bitset1_b32 s0, 31
2777; GFX1032-NEXT:    s_ff1_i32_b32 s0, s0
2778; GFX1032-NEXT:    s_cmp_gt_u32 s0, 9
2779; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
2780; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
2781; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
2782; GFX1032-NEXT:  ; %bb.1: ; %if.then
2783; GFX1032-NEXT:    ; divergent unreachable
2784; GFX1032-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2785; GFX1032-NEXT:    s_endpgm
2786;
2787; GFX1064-LABEL: fcmp32:
2788; GFX1064:       ; %bb.0: ; %entry
2789; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x28
2790; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, v0
2791; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2792; GFX1064-NEXT:    v_div_scale_f32 v1, s[0:1], s2, s2, v0
2793; GFX1064-NEXT:    v_rcp_f32_e32 v2, v1
2794; GFX1064-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
2795; GFX1064-NEXT:    v_fmac_f32_e32 v2, v3, v2
2796; GFX1064-NEXT:    v_div_scale_f32 v3, vcc, v0, s2, v0
2797; GFX1064-NEXT:    v_mul_f32_e32 v4, v3, v2
2798; GFX1064-NEXT:    v_fma_f32 v5, -v1, v4, v3
2799; GFX1064-NEXT:    v_fmac_f32_e32 v4, v5, v2
2800; GFX1064-NEXT:    v_fma_f32 v1, -v1, v4, v3
2801; GFX1064-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
2802; GFX1064-NEXT:    v_div_fixup_f32 v1, v1, s2, v0
2803; GFX1064-NEXT:    v_trunc_f32_e32 v1, v1
2804; GFX1064-NEXT:    v_fma_f32 v0, -v1, s2, v0
2805; GFX1064-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
2806; GFX1064-NEXT:    s_lshr_b32 s0, vcc_lo, 1
2807; GFX1064-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v0
2808; GFX1064-NEXT:    s_bitset1_b32 s0, 31
2809; GFX1064-NEXT:    s_ff1_i32_b32 s0, s0
2810; GFX1064-NEXT:    s_cmp_gt_u32 s0, 9
2811; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
2812; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
2813; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
2814; GFX1064-NEXT:  ; %bb.1: ; %if.then
2815; GFX1064-NEXT:    ; divergent unreachable
2816; GFX1064-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
2817; GFX1064-NEXT:    s_endpgm
2818entry:
2819  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2820  %id.f = uitofp i32 %id to float
; %mul4 and %cmp have no users in this function.
2821  %mul4 = fmul float %s, %n
2822  %cmp = fcmp ult float 0.0, %mul4
2823  br label %if.end
2824
2825if.end:                                           ; preds = %entry
2826  %rem.f = frem float %id.f, %s
2827  %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1)
2828  %shr = lshr i32 %fcmp, 1
; Shifting -1 left by 0 leaves all ones, so the following 'and' does not
; change %shr.
2829  %notmask = shl nsw i32 -1, 0
2830  %and = and i32 %notmask, %shr
; 2147483648 is bit 31 only, so the cttz input is never zero.
2831  %or = or i32 %and, 2147483648
2832  %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
2833  %cmp3 = icmp ugt i32 10, %cttz
2834  %cmp6 = fcmp one float %rem.f, 0.0
2835  %brmerge = or i1 %cmp6, %cmp3
2836  br i1 %brmerge, label %if.end2, label %if.then
2837
2838if.then:                                          ; preds = %if.end
2839  unreachable
2840
2841if.end2:                                          ; preds = %if.end
2842  ret void
2843}
2844
2845declare void @external_void_func_void() #1
2846
; Non-kernel function with no locals of its own that calls an external
; function, so the checks cover the call frame setup and teardown.
; Expected sequence under both prefixes: s33 is saved in s16 and set to s32;
; v40 is spilled to scratch with all lanes enabled (s_or_saveexec ... -1);
; the old s33 goes to lane 2 of v40 and the return address s30/s31 to lanes
; 0/1; the callee address is loaded through a gotpcrel32 relocation and
; called with s_swappc_b64; everything is then restored in reverse.
; The wave-size-dependent parts are the exec save/restore width (b32 with
; exec_lo vs b64 with exec) and the s32 increment: 0x200 under the GFX1032
; prefix versus 0x400 under GFX1064 (presumably a per-lane amount scaled by
; the wave size - confirm).
2847define void @callee_no_stack_with_call() #1 {
2848; GFX1032-LABEL: callee_no_stack_with_call:
2849; GFX1032:       ; %bb.0:
2850; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2851; GFX1032-NEXT:    s_mov_b32 s16, s33
2852; GFX1032-NEXT:    s_mov_b32 s33, s32
2853; GFX1032-NEXT:    s_or_saveexec_b32 s17, -1
2854; GFX1032-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
2855; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2856; GFX1032-NEXT:    s_mov_b32 exec_lo, s17
2857; GFX1032-NEXT:    s_addk_i32 s32, 0x200
2858; GFX1032-NEXT:    v_writelane_b32 v40, s16, 2
2859; GFX1032-NEXT:    s_getpc_b64 s[16:17]
2860; GFX1032-NEXT:    s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4
2861; GFX1032-NEXT:    s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12
2862; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
2863; GFX1032-NEXT:    v_writelane_b32 v40, s30, 0
2864; GFX1032-NEXT:    v_writelane_b32 v40, s31, 1
2865; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2866; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
2867; GFX1032-NEXT:    v_readlane_b32 s31, v40, 1
2868; GFX1032-NEXT:    v_readlane_b32 s30, v40, 0
2869; GFX1032-NEXT:    s_mov_b32 s32, s33
2870; GFX1032-NEXT:    v_readlane_b32 s4, v40, 2
2871; GFX1032-NEXT:    s_or_saveexec_b32 s5, -1
2872; GFX1032-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
2873; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2874; GFX1032-NEXT:    s_mov_b32 exec_lo, s5
2875; GFX1032-NEXT:    s_mov_b32 s33, s4
2876; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2877; GFX1032-NEXT:    s_setpc_b64 s[30:31]
2878;
2879; GFX1064-LABEL: callee_no_stack_with_call:
2880; GFX1064:       ; %bb.0:
2881; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2882; GFX1064-NEXT:    s_mov_b32 s16, s33
2883; GFX1064-NEXT:    s_mov_b32 s33, s32
2884; GFX1064-NEXT:    s_or_saveexec_b64 s[18:19], -1
2885; GFX1064-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
2886; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2887; GFX1064-NEXT:    s_mov_b64 exec, s[18:19]
2888; GFX1064-NEXT:    s_addk_i32 s32, 0x400
2889; GFX1064-NEXT:    v_writelane_b32 v40, s16, 2
2890; GFX1064-NEXT:    s_getpc_b64 s[16:17]
2891; GFX1064-NEXT:    s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4
2892; GFX1064-NEXT:    s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12
2893; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
2894; GFX1064-NEXT:    v_writelane_b32 v40, s30, 0
2895; GFX1064-NEXT:    v_writelane_b32 v40, s31, 1
2896; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2897; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
2898; GFX1064-NEXT:    v_readlane_b32 s31, v40, 1
2899; GFX1064-NEXT:    v_readlane_b32 s30, v40, 0
2900; GFX1064-NEXT:    s_mov_b32 s32, s33
2901; GFX1064-NEXT:    v_readlane_b32 s4, v40, 2
2902; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
2903; GFX1064-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
2904; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2905; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
2906; GFX1064-NEXT:    s_mov_b32 s33, s4
2907; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2908; GFX1064-NEXT:    s_setpc_b64 s[30:31]
; The single call is the only thing that forces the frame setup checked above.
2909  call void @external_void_func_void()
2910  ret void
2911}
2912
2913
; Intrinsic declarations for this test module. Every entry below is a bodiless
; `declare`: most are AMDGPU target intrinsics (@llvm.amdgcn.*), the remainder
; are generic LLVM intrinsics (@llvm.fabs.f32, @llvm.cttz.i32, @llvm.cttz.i64).
; Their call sites are presumably in the test functions earlier in the file;
; none are visible in this part of it, so confirm before removing any entry.
2914declare i32 @llvm.amdgcn.workitem.id.x()
2915declare float @llvm.fabs.f32(float)
2916declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
2917declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1)
2918declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
2919declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1)
2920declare i1 @llvm.amdgcn.class.f32(float, i32)
2921declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
2922declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
2923declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
2924declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
2925declare float @llvm.amdgcn.strict.wwm.f32(float)
2926declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
2927declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
2928declare float @llvm.amdgcn.wwm.f32(float)
2929declare i32 @llvm.amdgcn.wqm.i32(i32)
2930declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
2931declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32)
2932declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg)
2933declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
2934declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
2935declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32)
2936declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32)
2937declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32)
2938declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32)
2939declare void @llvm.amdgcn.kill(i1)
2940declare i1 @llvm.amdgcn.wqm.vote(i1)
2941declare i1 @llvm.amdgcn.ps.live()
2942declare i64 @llvm.cttz.i64(i64, i1)
2943declare i32 @llvm.cttz.i32(i32, i1)
; The only declaration in this list that names an attribute group (#5,
; defined with the other `attributes` lines further down).
2944declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
2945
; Attribute groups, referenced elsewhere in the module as #0..#5.
;   #0, #1, #2 - plain function attributes (nounwind / readnone / speculatable /
;                optnone / noinline combinations, exactly as listed).
;   #3, #4     - set the "target-features" string attribute to +wavefrontsize32
;                and +wavefrontsize64 respectively; presumably used to pin the
;                wave size per function regardless of the -mattr given on the
;                llc command line (TODO confirm against the functions using them).
;   #5         - attached to the @llvm.amdgcn.exp.f32 declaration.
; Use sites of #0-#4 are not visible in this part of the file; check for
; references before renumbering or deleting a group.
2946attributes #0 = { nounwind readnone speculatable }
2947attributes #1 = { nounwind }
2948attributes #2 = { nounwind readnone optnone noinline }
2949attributes #3 = { "target-features"="+wavefrontsize32" }
2950attributes #4 = { "target-features"="+wavefrontsize64" }
2951attributes #5 = { inaccessiblememonly nounwind }
2952;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2953; GFX10DEFWAVE: {{.*}}
2954