xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll (revision 40fa7f5e8b315159d45aa280c771af5998bdc75e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
6
; static_exact: pixel shader that calls llvm.amdgcn.wqm.demote with the
; constant condition `false`, then selects 1.0 or 0.0 on (%arg0 < 0.0) and
; exports that value in all four channels. The assertion lines below are
; autogenerated and must be regenerated rather than edited by hand.
7define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
8; SI-LABEL: static_exact:
9; SI:       ; %bb.0: ; %.entry
10; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
11; SI-NEXT:    s_andn2_b64 exec, exec, exec
12; SI-NEXT:    s_cbranch_scc0 .LBB0_2
13; SI-NEXT:  ; %bb.1: ; %.entry
14; SI-NEXT:    s_mov_b64 exec, 0
15; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
16; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
17; SI-NEXT:    s_endpgm
18; SI-NEXT:  .LBB0_2:
19; SI-NEXT:    s_mov_b64 exec, 0
20; SI-NEXT:    exp null off, off, off, off done vm
21; SI-NEXT:    s_endpgm
22;
23; GFX9-LABEL: static_exact:
24; GFX9:       ; %bb.0: ; %.entry
25; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
26; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
27; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
28; GFX9-NEXT:  ; %bb.1: ; %.entry
29; GFX9-NEXT:    s_mov_b64 exec, 0
30; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
31; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
32; GFX9-NEXT:    s_endpgm
33; GFX9-NEXT:  .LBB0_2:
34; GFX9-NEXT:    s_mov_b64 exec, 0
35; GFX9-NEXT:    exp null off, off, off, off done vm
36; GFX9-NEXT:    s_endpgm
37;
38; GFX10-32-LABEL: static_exact:
39; GFX10-32:       ; %bb.0: ; %.entry
40; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
41; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
42; GFX10-32-NEXT:    s_cbranch_scc0 .LBB0_2
43; GFX10-32-NEXT:  ; %bb.1: ; %.entry
44; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
45; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
46; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
47; GFX10-32-NEXT:    s_endpgm
48; GFX10-32-NEXT:  .LBB0_2:
49; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
50; GFX10-32-NEXT:    exp null off, off, off, off done vm
51; GFX10-32-NEXT:    s_endpgm
52;
53; GFX10-64-LABEL: static_exact:
54; GFX10-64:       ; %bb.0: ; %.entry
55; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
56; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
57; GFX10-64-NEXT:    s_cbranch_scc0 .LBB0_2
58; GFX10-64-NEXT:  ; %bb.1: ; %.entry
59; GFX10-64-NEXT:    s_mov_b64 exec, 0
60; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
61; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
62; GFX10-64-NEXT:    s_endpgm
63; GFX10-64-NEXT:  .LBB0_2:
64; GFX10-64-NEXT:    s_mov_b64 exec, 0
65; GFX10-64-NEXT:    exp null off, off, off, off done vm
66; GFX10-64-NEXT:    s_endpgm
67.entry:
68  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; NOTE(review): %c1 is computed but has no use in this function; it looks
  ; like a leftover shared with dynamic_exact. Confirm before removing.
69  %c1 = fcmp oge float %arg1, 0.0
  ; Demote condition is the literal false.
70  call void @llvm.amdgcn.wqm.demote(i1 false)
71  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
72  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
73  ret void
74}
75
; dynamic_exact: same shape as static_exact, but the demote condition is a
; run-time value, %c1 = (%arg1 >= 0.0, ordered compare). After the demote the
; shader selects 1.0 or 0.0 on (%arg0 < 0.0) and exports it in all four
; channels. The assertion lines below are autogenerated.
76define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
77; SI-LABEL: dynamic_exact:
78; SI:       ; %bb.0: ; %.entry
79; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
80; SI-NEXT:    s_mov_b64 s[2:3], exec
81; SI-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
82; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
83; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
84; SI-NEXT:    s_cbranch_scc0 .LBB1_2
85; SI-NEXT:  ; %bb.1: ; %.entry
86; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
87; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
88; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
89; SI-NEXT:    s_endpgm
90; SI-NEXT:  .LBB1_2:
91; SI-NEXT:    s_mov_b64 exec, 0
92; SI-NEXT:    exp null off, off, off, off done vm
93; SI-NEXT:    s_endpgm
94;
95; GFX9-LABEL: dynamic_exact:
96; GFX9:       ; %bb.0: ; %.entry
97; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
98; GFX9-NEXT:    s_mov_b64 s[2:3], exec
99; GFX9-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
100; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
101; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
102; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
103; GFX9-NEXT:  ; %bb.1: ; %.entry
104; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
105; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
106; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
107; GFX9-NEXT:    s_endpgm
108; GFX9-NEXT:  .LBB1_2:
109; GFX9-NEXT:    s_mov_b64 exec, 0
110; GFX9-NEXT:    exp null off, off, off, off done vm
111; GFX9-NEXT:    s_endpgm
112;
113; GFX10-32-LABEL: dynamic_exact:
114; GFX10-32:       ; %bb.0: ; %.entry
115; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
116; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
117; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
118; GFX10-32-NEXT:    s_andn2_b32 s0, exec_lo, s0
119; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
120; GFX10-32-NEXT:    s_cbranch_scc0 .LBB1_2
121; GFX10-32-NEXT:  ; %bb.1: ; %.entry
122; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
123; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
124; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
125; GFX10-32-NEXT:    s_endpgm
126; GFX10-32-NEXT:  .LBB1_2:
127; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
128; GFX10-32-NEXT:    exp null off, off, off, off done vm
129; GFX10-32-NEXT:    s_endpgm
130;
131; GFX10-64-LABEL: dynamic_exact:
132; GFX10-64:       ; %bb.0: ; %.entry
133; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
134; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
135; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
136; GFX10-64-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
137; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
138; GFX10-64-NEXT:    s_cbranch_scc0 .LBB1_2
139; GFX10-64-NEXT:  ; %bb.1: ; %.entry
140; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
141; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
142; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
143; GFX10-64-NEXT:    s_endpgm
144; GFX10-64-NEXT:  .LBB1_2:
145; GFX10-64-NEXT:    s_mov_b64 exec, 0
146; GFX10-64-NEXT:    exp null off, off, off, off done vm
147; GFX10-64-NEXT:    s_endpgm
148.entry:
149  %c0 = fcmp olt float %arg0, 0.000000e+00
150  %c1 = fcmp oge float %arg1, 0.0
  ; Demote condition comes from the %arg1 comparison above.
151  call void @llvm.amdgcn.wqm.demote(i1 %c1)
152  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
153  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
154  ret void
155}
156
; branch: the demote call sits in its own basic block (.demote) that is only
; reached when %c2 is false, i.e. when bit 0 of (fptosi(%arg0) | fptosi(%arg1))
; is set. Both paths join in .continue, which selects 1.0 or 0.0 on %c2 and
; exports it in all four channels. The assertion lines below are autogenerated.
157define amdgpu_ps void @branch(float %arg0, float %arg1) {
158; SI-LABEL: branch:
159; SI:       ; %bb.0: ; %.entry
160; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
161; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
162; SI-NEXT:    s_mov_b64 s[0:1], exec
163; SI-NEXT:    v_or_b32_e32 v0, v0, v1
164; SI-NEXT:    v_and_b32_e32 v0, 1, v0
165; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
166; SI-NEXT:    s_xor_b64 s[2:3], vcc, -1
167; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
168; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
169; SI-NEXT:    s_cbranch_execz .LBB2_3
170; SI-NEXT:  ; %bb.1: ; %.demote
171; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
172; SI-NEXT:    s_cbranch_scc0 .LBB2_4
173; SI-NEXT:  ; %bb.2: ; %.demote
174; SI-NEXT:    s_mov_b64 exec, 0
175; SI-NEXT:  .LBB2_3: ; %.continue
176; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
177; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
178; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
179; SI-NEXT:    s_endpgm
180; SI-NEXT:  .LBB2_4:
181; SI-NEXT:    s_mov_b64 exec, 0
182; SI-NEXT:    exp null off, off, off, off done vm
183; SI-NEXT:    s_endpgm
184;
185; GFX9-LABEL: branch:
186; GFX9:       ; %bb.0: ; %.entry
187; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
188; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
189; GFX9-NEXT:    s_mov_b64 s[0:1], exec
190; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
191; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
192; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
193; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
194; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
195; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
196; GFX9-NEXT:    s_cbranch_execz .LBB2_3
197; GFX9-NEXT:  ; %bb.1: ; %.demote
198; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
199; GFX9-NEXT:    s_cbranch_scc0 .LBB2_4
200; GFX9-NEXT:  ; %bb.2: ; %.demote
201; GFX9-NEXT:    s_mov_b64 exec, 0
202; GFX9-NEXT:  .LBB2_3: ; %.continue
203; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
204; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
205; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
206; GFX9-NEXT:    s_endpgm
207; GFX9-NEXT:  .LBB2_4:
208; GFX9-NEXT:    s_mov_b64 exec, 0
209; GFX9-NEXT:    exp null off, off, off, off done vm
210; GFX9-NEXT:    s_endpgm
211;
212; GFX10-32-LABEL: branch:
213; GFX10-32:       ; %bb.0: ; %.entry
214; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
215; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
216; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
217; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
218; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
219; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
220; GFX10-32-NEXT:    s_xor_b32 s1, vcc_lo, -1
221; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
222; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
223; GFX10-32-NEXT:    s_cbranch_execz .LBB2_3
224; GFX10-32-NEXT:  ; %bb.1: ; %.demote
225; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
226; GFX10-32-NEXT:    s_cbranch_scc0 .LBB2_4
227; GFX10-32-NEXT:  ; %bb.2: ; %.demote
228; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
229; GFX10-32-NEXT:  .LBB2_3: ; %.continue
230; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
231; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
232; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
233; GFX10-32-NEXT:    s_endpgm
234; GFX10-32-NEXT:  .LBB2_4:
235; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
236; GFX10-32-NEXT:    exp null off, off, off, off done vm
237; GFX10-32-NEXT:    s_endpgm
238;
239; GFX10-64-LABEL: branch:
240; GFX10-64:       ; %bb.0: ; %.entry
241; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
242; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
243; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
244; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
245; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
246; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
247; GFX10-64-NEXT:    s_xor_b64 s[2:3], vcc, -1
248; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
249; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
250; GFX10-64-NEXT:    s_cbranch_execz .LBB2_3
251; GFX10-64-NEXT:  ; %bb.1: ; %.demote
252; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
253; GFX10-64-NEXT:    s_cbranch_scc0 .LBB2_4
254; GFX10-64-NEXT:  ; %bb.2: ; %.demote
255; GFX10-64-NEXT:    s_mov_b64 exec, 0
256; GFX10-64-NEXT:  .LBB2_3: ; %.continue
257; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
258; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
259; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
260; GFX10-64-NEXT:    s_endpgm
261; GFX10-64-NEXT:  .LBB2_4:
262; GFX10-64-NEXT:    s_mov_b64 exec, 0
263; GFX10-64-NEXT:    exp null off, off, off, off done vm
264; GFX10-64-NEXT:    s_endpgm
265.entry:
  ; %c2 is true when the low bit of (int(%arg0) | int(%arg1)) is clear.
266  %i0 = fptosi float %arg0 to i32
267  %i1 = fptosi float %arg1 to i32
268  %c0 = or i32 %i0, %i1
269  %c1 = and i32 %c0, 1
270  %c2 = icmp eq i32 %c1, 0
271  br i1 %c2, label %.continue, label %.demote
272
273.demote:
  ; Only reached when %c2 is false; demote with the literal false condition.
274  call void @llvm.amdgcn.wqm.demote(i1 false)
275  br label %.continue
276
277.continue:
278  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
279  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
280  ret void
281}
282
; wqm_demote_1: branches on (%z < 0.0); when that is false the .demote block
; calls llvm.amdgcn.wqm.demote(false). The join block then performs two 1D
; image samples: the first at %coord, the second at a coordinate derived from
; the first sample's result. The second sample is the return value.
; The assertion lines below are autogenerated.
283define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
284; SI-LABEL: wqm_demote_1:
285; SI:       ; %bb.0: ; %.entry
286; SI-NEXT:    s_mov_b64 s[12:13], exec
287; SI-NEXT:    s_wqm_b64 exec, exec
288; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
289; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
290; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
291; SI-NEXT:    s_cbranch_execz .LBB3_3
292; SI-NEXT:  ; %bb.1: ; %.demote
293; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
294; SI-NEXT:    s_cbranch_scc0 .LBB3_4
295; SI-NEXT:  ; %bb.2: ; %.demote
296; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
297; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
298; SI-NEXT:  .LBB3_3: ; %.continue
299; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
300; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
301; SI-NEXT:    s_waitcnt vmcnt(0)
302; SI-NEXT:    v_add_f32_e32 v0, v0, v0
303; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
304; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
305; SI-NEXT:    s_waitcnt vmcnt(0)
306; SI-NEXT:    s_branch .LBB3_5
307; SI-NEXT:  .LBB3_4:
308; SI-NEXT:    s_mov_b64 exec, 0
309; SI-NEXT:    exp null off, off, off, off done vm
310; SI-NEXT:    s_endpgm
311; SI-NEXT:  .LBB3_5:
312;
313; GFX9-LABEL: wqm_demote_1:
314; GFX9:       ; %bb.0: ; %.entry
315; GFX9-NEXT:    s_mov_b64 s[12:13], exec
316; GFX9-NEXT:    s_wqm_b64 exec, exec
317; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
318; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
319; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
320; GFX9-NEXT:    s_cbranch_execz .LBB3_3
321; GFX9-NEXT:  ; %bb.1: ; %.demote
322; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
323; GFX9-NEXT:    s_cbranch_scc0 .LBB3_4
324; GFX9-NEXT:  ; %bb.2: ; %.demote
325; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
326; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
327; GFX9-NEXT:  .LBB3_3: ; %.continue
328; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
329; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
330; GFX9-NEXT:    s_waitcnt vmcnt(0)
331; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
332; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
333; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
334; GFX9-NEXT:    s_waitcnt vmcnt(0)
335; GFX9-NEXT:    s_branch .LBB3_5
336; GFX9-NEXT:  .LBB3_4:
337; GFX9-NEXT:    s_mov_b64 exec, 0
338; GFX9-NEXT:    exp null off, off, off, off done vm
339; GFX9-NEXT:    s_endpgm
340; GFX9-NEXT:  .LBB3_5:
341;
342; GFX10-32-LABEL: wqm_demote_1:
343; GFX10-32:       ; %bb.0: ; %.entry
344; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
345; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
346; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
347; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
348; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
349; GFX10-32-NEXT:    s_cbranch_execz .LBB3_3
350; GFX10-32-NEXT:  ; %bb.1: ; %.demote
351; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
352; GFX10-32-NEXT:    s_cbranch_scc0 .LBB3_4
353; GFX10-32-NEXT:  ; %bb.2: ; %.demote
354; GFX10-32-NEXT:    s_wqm_b32 s14, s12
355; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
356; GFX10-32-NEXT:  .LBB3_3: ; %.continue
357; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
358; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
359; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
360; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
361; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
362; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
363; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
364; GFX10-32-NEXT:    s_branch .LBB3_5
365; GFX10-32-NEXT:  .LBB3_4:
366; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
367; GFX10-32-NEXT:    exp null off, off, off, off done vm
368; GFX10-32-NEXT:    s_endpgm
369; GFX10-32-NEXT:  .LBB3_5:
370;
371; GFX10-64-LABEL: wqm_demote_1:
372; GFX10-64:       ; %bb.0: ; %.entry
373; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
374; GFX10-64-NEXT:    s_wqm_b64 exec, exec
375; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
376; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
377; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
378; GFX10-64-NEXT:    s_cbranch_execz .LBB3_3
379; GFX10-64-NEXT:  ; %bb.1: ; %.demote
380; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
381; GFX10-64-NEXT:    s_cbranch_scc0 .LBB3_4
382; GFX10-64-NEXT:  ; %bb.2: ; %.demote
383; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
384; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
385; GFX10-64-NEXT:  .LBB3_3: ; %.continue
386; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
387; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
388; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
389; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
390; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
391; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
392; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
393; GFX10-64-NEXT:    s_branch .LBB3_5
394; GFX10-64-NEXT:  .LBB3_4:
395; GFX10-64-NEXT:    s_mov_b64 exec, 0
396; GFX10-64-NEXT:    exp null off, off, off, off done vm
397; GFX10-64-NEXT:    s_endpgm
398; GFX10-64-NEXT:  .LBB3_5:
399.entry:
  ; The demote decision depends only on the %z input, before any sampling.
400  %z.cmp = fcmp olt float %z, 0.0
401  br i1 %z.cmp, label %.continue, label %.demote
402
403.demote:
404  call void @llvm.amdgcn.wqm.demote(i1 false)
405  br label %.continue
406
407.continue:
408  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; NOTE(review): %tex0 and %tex1 both extract element 0, so %coord1 is
  ; element 0 added to itself. Presumably intentional for the test; confirm.
409  %tex0 = extractelement <4 x float> %tex, i32 0
410  %tex1 = extractelement <4 x float> %tex, i32 0
411  %coord1 = fadd float %tex0, %tex1
412  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
413
414  ret <4 x float> %rtex
415}
416
; wqm_demote_2: like wqm_demote_1, but the first image sample happens in the
; entry block and the branch condition is computed from its result
; (%tex0 < 0.0). When the condition is false the .demote block calls
; llvm.amdgcn.wqm.demote(false). The join block does the second sample, which
; is the return value. The assertion lines below are autogenerated.
417define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
418; SI-LABEL: wqm_demote_2:
419; SI:       ; %bb.0: ; %.entry
420; SI-NEXT:    s_mov_b64 s[12:13], exec
421; SI-NEXT:    s_wqm_b64 exec, exec
422; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
423; SI-NEXT:    s_waitcnt vmcnt(0)
424; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
425; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
426; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
427; SI-NEXT:    s_cbranch_execz .LBB4_3
428; SI-NEXT:  ; %bb.1: ; %.demote
429; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
430; SI-NEXT:    s_cbranch_scc0 .LBB4_4
431; SI-NEXT:  ; %bb.2: ; %.demote
432; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
433; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
434; SI-NEXT:  .LBB4_3: ; %.continue
435; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
436; SI-NEXT:    v_add_f32_e32 v0, v0, v0
437; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
438; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
439; SI-NEXT:    s_waitcnt vmcnt(0)
440; SI-NEXT:    s_branch .LBB4_5
441; SI-NEXT:  .LBB4_4:
442; SI-NEXT:    s_mov_b64 exec, 0
443; SI-NEXT:    exp null off, off, off, off done vm
444; SI-NEXT:    s_endpgm
445; SI-NEXT:  .LBB4_5:
446;
447; GFX9-LABEL: wqm_demote_2:
448; GFX9:       ; %bb.0: ; %.entry
449; GFX9-NEXT:    s_mov_b64 s[12:13], exec
450; GFX9-NEXT:    s_wqm_b64 exec, exec
451; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
452; GFX9-NEXT:    s_waitcnt vmcnt(0)
453; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
454; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
455; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
456; GFX9-NEXT:    s_cbranch_execz .LBB4_3
457; GFX9-NEXT:  ; %bb.1: ; %.demote
458; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
459; GFX9-NEXT:    s_cbranch_scc0 .LBB4_4
460; GFX9-NEXT:  ; %bb.2: ; %.demote
461; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
462; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
463; GFX9-NEXT:  .LBB4_3: ; %.continue
464; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
465; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
466; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
467; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
468; GFX9-NEXT:    s_waitcnt vmcnt(0)
469; GFX9-NEXT:    s_branch .LBB4_5
470; GFX9-NEXT:  .LBB4_4:
471; GFX9-NEXT:    s_mov_b64 exec, 0
472; GFX9-NEXT:    exp null off, off, off, off done vm
473; GFX9-NEXT:    s_endpgm
474; GFX9-NEXT:  .LBB4_5:
475;
476; GFX10-32-LABEL: wqm_demote_2:
477; GFX10-32:       ; %bb.0: ; %.entry
478; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
479; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
480; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
481; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
482; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
483; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
484; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
485; GFX10-32-NEXT:    s_cbranch_execz .LBB4_3
486; GFX10-32-NEXT:  ; %bb.1: ; %.demote
487; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
488; GFX10-32-NEXT:    s_cbranch_scc0 .LBB4_4
489; GFX10-32-NEXT:  ; %bb.2: ; %.demote
490; GFX10-32-NEXT:    s_wqm_b32 s14, s12
491; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
492; GFX10-32-NEXT:  .LBB4_3: ; %.continue
493; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
494; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
495; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
496; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
497; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
498; GFX10-32-NEXT:    s_branch .LBB4_5
499; GFX10-32-NEXT:  .LBB4_4:
500; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
501; GFX10-32-NEXT:    exp null off, off, off, off done vm
502; GFX10-32-NEXT:    s_endpgm
503; GFX10-32-NEXT:  .LBB4_5:
504;
505; GFX10-64-LABEL: wqm_demote_2:
506; GFX10-64:       ; %bb.0: ; %.entry
507; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
508; GFX10-64-NEXT:    s_wqm_b64 exec, exec
509; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
510; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
511; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
512; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
513; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
514; GFX10-64-NEXT:    s_cbranch_execz .LBB4_3
515; GFX10-64-NEXT:  ; %bb.1: ; %.demote
516; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
517; GFX10-64-NEXT:    s_cbranch_scc0 .LBB4_4
518; GFX10-64-NEXT:  ; %bb.2: ; %.demote
519; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
520; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
521; GFX10-64-NEXT:  .LBB4_3: ; %.continue
522; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
523; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
524; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
525; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
526; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
527; GFX10-64-NEXT:    s_branch .LBB4_5
528; GFX10-64-NEXT:  .LBB4_4:
529; GFX10-64-NEXT:    s_mov_b64 exec, 0
530; GFX10-64-NEXT:    exp null off, off, off, off done vm
531; GFX10-64-NEXT:    s_endpgm
532; GFX10-64-NEXT:  .LBB4_5:
533.entry:
534  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; NOTE(review): %tex0 and %tex1 both extract element 0, so %coord1 below is
  ; element 0 added to itself. Presumably intentional for the test; confirm.
535  %tex0 = extractelement <4 x float> %tex, i32 0
536  %tex1 = extractelement <4 x float> %tex, i32 0
  ; The demote decision depends on the sampled value, not on the %z input.
537  %z.cmp = fcmp olt float %tex0, 0.0
538  br i1 %z.cmp, label %.continue, label %.demote
539
540.demote:
541  call void @llvm.amdgcn.wqm.demote(i1 false)
542  br label %.continue
543
544.continue:
545  %coord1 = fadd float %tex0, %tex1
546  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
547
548  ret <4 x float> %rtex
549}
550
; wqm_demote_dynamic: single-block variant of wqm_demote_2. The first sample's
; element 0 is compared against 0.0 and that i1 is passed straight to
; llvm.amdgcn.wqm.demote, with no IR-level branch. A second sample at the
; derived coordinate is the return value.
; The assertion lines below are autogenerated.
551define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
552; SI-LABEL: wqm_demote_dynamic:
553; SI:       ; %bb.0: ; %.entry
554; SI-NEXT:    s_mov_b64 s[12:13], exec
555; SI-NEXT:    s_wqm_b64 exec, exec
556; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
557; SI-NEXT:    s_waitcnt vmcnt(0)
558; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
559; SI-NEXT:    s_andn2_b64 s[14:15], exec, vcc
560; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
561; SI-NEXT:    s_cbranch_scc0 .LBB5_2
562; SI-NEXT:  ; %bb.1: ; %.entry
563; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
564; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
565; SI-NEXT:    v_add_f32_e32 v0, v0, v0
566; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
567; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
568; SI-NEXT:    s_waitcnt vmcnt(0)
569; SI-NEXT:    s_branch .LBB5_3
570; SI-NEXT:  .LBB5_2:
571; SI-NEXT:    s_mov_b64 exec, 0
572; SI-NEXT:    exp null off, off, off, off done vm
573; SI-NEXT:    s_endpgm
574; SI-NEXT:  .LBB5_3:
575;
576; GFX9-LABEL: wqm_demote_dynamic:
577; GFX9:       ; %bb.0: ; %.entry
578; GFX9-NEXT:    s_mov_b64 s[12:13], exec
579; GFX9-NEXT:    s_wqm_b64 exec, exec
580; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
581; GFX9-NEXT:    s_waitcnt vmcnt(0)
582; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
583; GFX9-NEXT:    s_andn2_b64 s[14:15], exec, vcc
584; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
585; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
586; GFX9-NEXT:  ; %bb.1: ; %.entry
587; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
588; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
589; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
590; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
591; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
592; GFX9-NEXT:    s_waitcnt vmcnt(0)
593; GFX9-NEXT:    s_branch .LBB5_3
594; GFX9-NEXT:  .LBB5_2:
595; GFX9-NEXT:    s_mov_b64 exec, 0
596; GFX9-NEXT:    exp null off, off, off, off done vm
597; GFX9-NEXT:    s_endpgm
598; GFX9-NEXT:  .LBB5_3:
599;
600; GFX10-32-LABEL: wqm_demote_dynamic:
601; GFX10-32:       ; %bb.0: ; %.entry
602; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
603; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
604; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
605; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
606; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
607; GFX10-32-NEXT:    s_andn2_b32 s13, exec_lo, vcc_lo
608; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
609; GFX10-32-NEXT:    s_cbranch_scc0 .LBB5_2
610; GFX10-32-NEXT:  ; %bb.1: ; %.entry
611; GFX10-32-NEXT:    s_wqm_b32 s13, s12
612; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
613; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
614; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
615; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
616; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
617; GFX10-32-NEXT:    s_branch .LBB5_3
618; GFX10-32-NEXT:  .LBB5_2:
619; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
620; GFX10-32-NEXT:    exp null off, off, off, off done vm
621; GFX10-32-NEXT:    s_endpgm
622; GFX10-32-NEXT:  .LBB5_3:
623;
624; GFX10-64-LABEL: wqm_demote_dynamic:
625; GFX10-64:       ; %bb.0: ; %.entry
626; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
627; GFX10-64-NEXT:    s_wqm_b64 exec, exec
628; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
629; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
630; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
631; GFX10-64-NEXT:    s_andn2_b64 s[14:15], exec, vcc
632; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
633; GFX10-64-NEXT:    s_cbranch_scc0 .LBB5_2
634; GFX10-64-NEXT:  ; %bb.1: ; %.entry
635; GFX10-64-NEXT:    s_wqm_b64 s[14:15], s[12:13]
636; GFX10-64-NEXT:    s_and_b64 exec, exec, s[14:15]
637; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
638; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
639; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
640; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
641; GFX10-64-NEXT:    s_branch .LBB5_3
642; GFX10-64-NEXT:  .LBB5_2:
643; GFX10-64-NEXT:    s_mov_b64 exec, 0
644; GFX10-64-NEXT:    exp null off, off, off, off done vm
645; GFX10-64-NEXT:    s_endpgm
646; GFX10-64-NEXT:  .LBB5_3:
647.entry:
648  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; NOTE(review): %tex0 and %tex1 both extract element 0, so %coord1 below is
  ; element 0 added to itself. Presumably intentional for the test; confirm.
649  %tex0 = extractelement <4 x float> %tex, i32 0
650  %tex1 = extractelement <4 x float> %tex, i32 0
651  %z.cmp = fcmp olt float %tex0, 0.0
  ; The comparison result is the demote condition itself.
652  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
653  %coord1 = fadd float %tex0, %tex1
654  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
655
656  ret <4 x float> %rtex
657}
658
659define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
660; SI-LABEL: wqm_deriv:
661; SI:       ; %bb.0: ; %.entry
662; SI-NEXT:    s_mov_b64 s[0:1], exec
663; SI-NEXT:    s_wqm_b64 exec, exec
664; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
665; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
666; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
667; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
668; SI-NEXT:    s_cbranch_execz .LBB6_3
669; SI-NEXT:  ; %bb.1: ; %.demote0
670; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
671; SI-NEXT:    s_cbranch_scc0 .LBB6_7
672; SI-NEXT:  ; %bb.2: ; %.demote0
673; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
674; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
675; SI-NEXT:  .LBB6_3: ; %.continue0
676; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
677; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
678; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
679; SI-NEXT:    v_mov_b32_e32 v1, v0
680; SI-NEXT:    s_nop 1
681; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
682; SI-NEXT:    s_nop 1
683; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
684; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
685; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
686; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
687; SI-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
688; SI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
689; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
690; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
691; SI-NEXT:    s_cbranch_execz .LBB6_6
692; SI-NEXT:  ; %bb.4: ; %.demote1
693; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
694; SI-NEXT:    s_cbranch_scc0 .LBB6_7
695; SI-NEXT:  ; %bb.5: ; %.demote1
696; SI-NEXT:    s_mov_b64 exec, 0
697; SI-NEXT:  .LBB6_6: ; %.continue1
698; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
699; SI-NEXT:    v_mov_b32_e32 v0, 0x3c00
700; SI-NEXT:    v_bfrev_b32_e32 v1, 60
701; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
702; SI-NEXT:    s_endpgm
703; SI-NEXT:  .LBB6_7:
704; SI-NEXT:    s_mov_b64 exec, 0
705; SI-NEXT:    exp null off, off, off, off done vm
706; SI-NEXT:    s_endpgm
707;
708; GFX9-LABEL: wqm_deriv:
709; GFX9:       ; %bb.0: ; %.entry
710; GFX9-NEXT:    s_mov_b64 s[0:1], exec
711; GFX9-NEXT:    s_wqm_b64 exec, exec
712; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
713; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
714; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
715; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
716; GFX9-NEXT:    s_cbranch_execz .LBB6_3
717; GFX9-NEXT:  ; %bb.1: ; %.demote0
718; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
719; GFX9-NEXT:    s_cbranch_scc0 .LBB6_7
720; GFX9-NEXT:  ; %bb.2: ; %.demote0
721; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
722; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
723; GFX9-NEXT:  .LBB6_3: ; %.continue0
724; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
725; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
726; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
727; GFX9-NEXT:    v_mov_b32_e32 v1, v0
728; GFX9-NEXT:    s_nop 1
729; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
730; GFX9-NEXT:    s_nop 1
731; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
732; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
733; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
734; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
735; GFX9-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
736; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
737; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
738; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
739; GFX9-NEXT:    s_cbranch_execz .LBB6_6
740; GFX9-NEXT:  ; %bb.4: ; %.demote1
741; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
742; GFX9-NEXT:    s_cbranch_scc0 .LBB6_7
743; GFX9-NEXT:  ; %bb.5: ; %.demote1
744; GFX9-NEXT:    s_mov_b64 exec, 0
745; GFX9-NEXT:  .LBB6_6: ; %.continue1
746; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
747; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
748; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
749; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
750; GFX9-NEXT:    s_endpgm
751; GFX9-NEXT:  .LBB6_7:
752; GFX9-NEXT:    s_mov_b64 exec, 0
753; GFX9-NEXT:    exp null off, off, off, off done vm
754; GFX9-NEXT:    s_endpgm
755;
756; GFX10-32-LABEL: wqm_deriv:
757; GFX10-32:       ; %bb.0: ; %.entry
758; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
759; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
760; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
761; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
762; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
763; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
764; GFX10-32-NEXT:    s_cbranch_execz .LBB6_3
765; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
766; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
767; GFX10-32-NEXT:    s_cbranch_scc0 .LBB6_7
768; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
769; GFX10-32-NEXT:    s_wqm_b32 s2, s0
770; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
771; GFX10-32-NEXT:  .LBB6_3: ; %.continue0
772; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
773; GFX10-32-NEXT:    s_mov_b32 s1, s0
774; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
775; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
776; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
777; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
778; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
779; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
780; GFX10-32-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
781; GFX10-32-NEXT:    s_and_b32 s1, s0, vcc_lo
782; GFX10-32-NEXT:    s_xor_b32 s1, s1, -1
783; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
784; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
785; GFX10-32-NEXT:    s_cbranch_execz .LBB6_6
786; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
787; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
788; GFX10-32-NEXT:    s_cbranch_scc0 .LBB6_7
789; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
790; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
791; GFX10-32-NEXT:  .LBB6_6: ; %.continue1
792; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
793; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
794; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
795; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
796; GFX10-32-NEXT:    s_endpgm
797; GFX10-32-NEXT:  .LBB6_7:
798; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
799; GFX10-32-NEXT:    exp null off, off, off, off done vm
800; GFX10-32-NEXT:    s_endpgm
801;
802; GFX10-64-LABEL: wqm_deriv:
803; GFX10-64:       ; %bb.0: ; %.entry
804; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
805; GFX10-64-NEXT:    s_wqm_b64 exec, exec
806; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
807; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
808; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
809; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
810; GFX10-64-NEXT:    s_cbranch_execz .LBB6_3
811; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
812; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
813; GFX10-64-NEXT:    s_cbranch_scc0 .LBB6_7
814; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
815; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
816; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
817; GFX10-64-NEXT:  .LBB6_3: ; %.continue0
818; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
819; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
820; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
821; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
822; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
823; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
824; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
825; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
826; GFX10-64-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
827; GFX10-64-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
828; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
829; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
830; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
831; GFX10-64-NEXT:    s_cbranch_execz .LBB6_6
832; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
833; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
834; GFX10-64-NEXT:    s_cbranch_scc0 .LBB6_7
835; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
836; GFX10-64-NEXT:    s_mov_b64 exec, 0
837; GFX10-64-NEXT:  .LBB6_6: ; %.continue1
838; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
839; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
840; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
841; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
842; GFX10-64-NEXT:    s_endpgm
843; GFX10-64-NEXT:  .LBB6_7:
844; GFX10-64-NEXT:    s_mov_b64 exec, 0
845; GFX10-64-NEXT:    exp null off, off, off, off done vm
846; GFX10-64-NEXT:    s_endpgm
847.entry:
848  %p0 = extractelement <2 x float> %input, i32 0
849  %p1 = extractelement <2 x float> %input, i32 1
850  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
851  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
852  %argi = fptosi float %arg to i32
853  %cond0 = icmp eq i32 %argi, 0
854  br i1 %cond0, label %.continue0, label %.demote0
855
856.demote0:
857  call void @llvm.amdgcn.wqm.demote(i1 false)
858  br label %.continue0
859
860.continue0:
861  %live = call i1 @llvm.amdgcn.live.mask()
862  %live.cond = select i1 %live, i32 0, i32 1065353216
863  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
864  %live.v0f = bitcast i32 %live.v0 to float
865  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
866  %live.v1f = bitcast i32 %live.v1 to float
867  %v0 = fsub float %live.v0f, %live.v1f
868  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
869  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
870  %cond2 = and i1 %live, %cond1
871  br i1 %cond2, label %.continue1, label %.demote1
872
873.demote1:
874  call void @llvm.amdgcn.wqm.demote(i1 false)
875  br label %.continue1
876
877.continue1:
878  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
879  ret void
880}
881
; Pixel-shader (amdgpu_ps) test: llvm.amdgcn.wqm.demote is executed once before
; a loop and again inside the loop body, where llvm.amdgcn.live.mask feeds two
; quad-permute DPP moves whose difference decides whether to demote.
;   %input - <2 x float>; both elements are passed to interp.p1 / interp.p2
;            (the interpolated result %x1 is not read afterwards).
;   %arg   - converted to i32; a non-zero value takes the pre-loop demote path.
;   %index - forwarded as the last operand of both interp calls.
;   %limit - loop bound; the loop repeats while (%count + 1) < %limit (signed).
; The check lines inside the function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
882define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
883; SI-LABEL: wqm_deriv_loop:
884; SI:       ; %bb.0: ; %.entry
885; SI-NEXT:    s_mov_b64 s[0:1], exec
886; SI-NEXT:    s_wqm_b64 exec, exec
887; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
888; SI-NEXT:    s_mov_b32 s4, 0
889; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
890; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
891; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
892; SI-NEXT:    s_cbranch_execz .LBB7_3
893; SI-NEXT:  ; %bb.1: ; %.demote0
894; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
895; SI-NEXT:    s_cbranch_scc0 .LBB7_9
896; SI-NEXT:  ; %bb.2: ; %.demote0
897; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
898; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
899; SI-NEXT:  .LBB7_3: ; %.continue0.preheader
900; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
901; SI-NEXT:    s_mov_b64 s[2:3], 0
902; SI-NEXT:    v_mov_b32_e32 v0, s4
903; SI-NEXT:    s_branch .LBB7_5
904; SI-NEXT:  .LBB7_4: ; %.continue1
905; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
906; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
907; SI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
908; SI-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
909; SI-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
910; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
911; SI-NEXT:    s_cbranch_execz .LBB7_8
912; SI-NEXT:  .LBB7_5: ; %.continue0
913; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
914; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
915; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
916; SI-NEXT:    v_mov_b32_e32 v3, v2
917; SI-NEXT:    s_nop 1
918; SI-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
919; SI-NEXT:    s_nop 1
920; SI-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
921; SI-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
922; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
923; SI-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
924; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
925; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
926; SI-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
927; SI-NEXT:    s_cbranch_execz .LBB7_4
928; SI-NEXT:  ; %bb.6: ; %.demote1
929; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
930; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
931; SI-NEXT:    s_cbranch_scc0 .LBB7_9
932; SI-NEXT:  ; %bb.7: ; %.demote1
933; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
934; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
935; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
936; SI-NEXT:    s_branch .LBB7_4
937; SI-NEXT:  .LBB7_8: ; %.return
938; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
939; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
940; SI-NEXT:    v_mov_b32_e32 v0, 0x3c00
941; SI-NEXT:    v_bfrev_b32_e32 v1, 60
942; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
943; SI-NEXT:    s_endpgm
944; SI-NEXT:  .LBB7_9:
945; SI-NEXT:    s_mov_b64 exec, 0
946; SI-NEXT:    exp null off, off, off, off done vm
947; SI-NEXT:    s_endpgm
948;
949; GFX9-LABEL: wqm_deriv_loop:
950; GFX9:       ; %bb.0: ; %.entry
951; GFX9-NEXT:    s_mov_b64 s[0:1], exec
952; GFX9-NEXT:    s_wqm_b64 exec, exec
953; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
954; GFX9-NEXT:    s_mov_b32 s4, 0
955; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
956; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
957; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
958; GFX9-NEXT:    s_cbranch_execz .LBB7_3
959; GFX9-NEXT:  ; %bb.1: ; %.demote0
960; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
961; GFX9-NEXT:    s_cbranch_scc0 .LBB7_9
962; GFX9-NEXT:  ; %bb.2: ; %.demote0
963; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
964; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
965; GFX9-NEXT:  .LBB7_3: ; %.continue0.preheader
966; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
967; GFX9-NEXT:    s_mov_b64 s[2:3], 0
968; GFX9-NEXT:    v_mov_b32_e32 v0, s4
969; GFX9-NEXT:    s_branch .LBB7_5
970; GFX9-NEXT:  .LBB7_4: ; %.continue1
971; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
972; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
973; GFX9-NEXT:    v_add_u32_e32 v0, 1, v0
974; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
975; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
976; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
977; GFX9-NEXT:    s_cbranch_execz .LBB7_8
978; GFX9-NEXT:  .LBB7_5: ; %.continue0
979; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
980; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
981; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
982; GFX9-NEXT:    v_mov_b32_e32 v3, v2
983; GFX9-NEXT:    s_nop 1
984; GFX9-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
985; GFX9-NEXT:    s_nop 1
986; GFX9-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
987; GFX9-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
988; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
989; GFX9-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
990; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
991; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
992; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
993; GFX9-NEXT:    s_cbranch_execz .LBB7_4
994; GFX9-NEXT:  ; %bb.6: ; %.demote1
995; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
996; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
997; GFX9-NEXT:    s_cbranch_scc0 .LBB7_9
998; GFX9-NEXT:  ; %bb.7: ; %.demote1
999; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1000; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
1001; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
1002; GFX9-NEXT:    s_branch .LBB7_4
1003; GFX9-NEXT:  .LBB7_8: ; %.return
1004; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1005; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
1006; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
1007; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
1008; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1009; GFX9-NEXT:    s_endpgm
1010; GFX9-NEXT:  .LBB7_9:
1011; GFX9-NEXT:    s_mov_b64 exec, 0
1012; GFX9-NEXT:    exp null off, off, off, off done vm
1013; GFX9-NEXT:    s_endpgm
1014;
1015; GFX10-32-LABEL: wqm_deriv_loop:
1016; GFX10-32:       ; %bb.0: ; %.entry
1017; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
1018; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1019; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
1020; GFX10-32-NEXT:    s_mov_b32 s1, 0
1021; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1022; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1023; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
1024; GFX10-32-NEXT:    s_cbranch_execz .LBB7_3
1025; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
1026; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
1027; GFX10-32-NEXT:    s_cbranch_scc0 .LBB7_9
1028; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
1029; GFX10-32-NEXT:    s_wqm_b32 s3, s0
1030; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
1031; GFX10-32-NEXT:  .LBB7_3: ; %.continue0.preheader
1032; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1033; GFX10-32-NEXT:    v_mov_b32_e32 v0, s1
1034; GFX10-32-NEXT:    s_branch .LBB7_5
1035; GFX10-32-NEXT:  .LBB7_4: ; %.continue1
1036; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1037; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1038; GFX10-32-NEXT:    v_add_nc_u32_e32 v0, 1, v0
1039; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v1
1040; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
1041; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
1042; GFX10-32-NEXT:    s_cbranch_execz .LBB7_8
1043; GFX10-32-NEXT:  .LBB7_5: ; %.continue0
1044; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
1045; GFX10-32-NEXT:    s_mov_b32 s2, s0
1046; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s2
1047; GFX10-32-NEXT:    v_mov_b32_e32 v3, v2
1048; GFX10-32-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
1049; GFX10-32-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
1050; GFX10-32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
1051; GFX10-32-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
1052; GFX10-32-NEXT:    s_and_b32 s2, s0, vcc_lo
1053; GFX10-32-NEXT:    s_xor_b32 s2, s2, -1
1054; GFX10-32-NEXT:    s_and_saveexec_b32 s3, s2
1055; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s3
1056; GFX10-32-NEXT:    s_cbranch_execz .LBB7_4
1057; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
1058; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1059; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
1060; GFX10-32-NEXT:    s_cbranch_scc0 .LBB7_9
1061; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
1062; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1063; GFX10-32-NEXT:    s_wqm_b32 s3, s0
1064; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
1065; GFX10-32-NEXT:    s_branch .LBB7_4
1066; GFX10-32-NEXT:  .LBB7_8: ; %.return
1067; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1068; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1069; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
1070; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
1071; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1072; GFX10-32-NEXT:    s_endpgm
1073; GFX10-32-NEXT:  .LBB7_9:
1074; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
1075; GFX10-32-NEXT:    exp null off, off, off, off done vm
1076; GFX10-32-NEXT:    s_endpgm
1077;
1078; GFX10-64-LABEL: wqm_deriv_loop:
1079; GFX10-64:       ; %bb.0: ; %.entry
1080; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
1081; GFX10-64-NEXT:    s_wqm_b64 exec, exec
1082; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
1083; GFX10-64-NEXT:    s_mov_b32 s4, 0
1084; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1085; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1086; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
1087; GFX10-64-NEXT:    s_cbranch_execz .LBB7_3
1088; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
1089; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1090; GFX10-64-NEXT:    s_cbranch_scc0 .LBB7_9
1091; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
1092; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
1093; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
1094; GFX10-64-NEXT:  .LBB7_3: ; %.continue0.preheader
1095; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
1096; GFX10-64-NEXT:    v_mov_b32_e32 v0, s4
1097; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
1098; GFX10-64-NEXT:    s_branch .LBB7_5
1099; GFX10-64-NEXT:  .LBB7_4: ; %.continue1
1100; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1101; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
1102; GFX10-64-NEXT:    v_add_nc_u32_e32 v0, 1, v0
1103; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
1104; GFX10-64-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
1105; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[2:3]
1106; GFX10-64-NEXT:    s_cbranch_execz .LBB7_8
1107; GFX10-64-NEXT:  .LBB7_5: ; %.continue0
1108; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
1109; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
1110; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
1111; GFX10-64-NEXT:    v_mov_b32_e32 v3, v2
1112; GFX10-64-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
1113; GFX10-64-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
1114; GFX10-64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
1115; GFX10-64-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
1116; GFX10-64-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
1117; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
1118; GFX10-64-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
1119; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
1120; GFX10-64-NEXT:    s_cbranch_execz .LBB7_4
1121; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
1122; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1123; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1124; GFX10-64-NEXT:    s_cbranch_scc0 .LBB7_9
1125; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
1126; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1127; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
1128; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
1129; GFX10-64-NEXT:    s_branch .LBB7_4
1130; GFX10-64-NEXT:  .LBB7_8: ; %.return
1131; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
1132; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
1133; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
1134; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
1135; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1136; GFX10-64-NEXT:    s_endpgm
1137; GFX10-64-NEXT:  .LBB7_9:
1138; GFX10-64-NEXT:    s_mov_b64 exec, 0
1139; GFX10-64-NEXT:    exp null off, off, off, off done vm
1140; GFX10-64-NEXT:    s_endpgm
1141.entry:
  ; Interpolate from the two %input elements; the result %x1 is never read.
1142  %p0 = extractelement <2 x float> %input, i32 0
1143  %p1 = extractelement <2 x float> %input, i32 1
1144  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
1145  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
  ; Lanes whose %arg converts to a non-zero integer go through .demote0 first.
1146  %argi = fptosi float %arg to i32
1147  %cond0 = icmp eq i32 %argi, 0
1148  br i1 %cond0, label %.continue0, label %.demote0
1149
1150.demote0:
  ; Pre-loop demote. In the generated code above this block clears the lanes
  ; in exec from the mask that was copied from exec at function entry, and
  ; branches to a null-export exit when the scalar condition code is 0.
1151  call void @llvm.amdgcn.wqm.demote(i1 false)
1152  br label %.continue0
1153
1154.continue0:
  ; Loop header: %count is 0 when arriving from .entry or .demote0 and %next
  ; when arriving over the back edge from .continue1.
1155  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  ; Lanes reported live contribute 0; all other lanes contribute %count.
1156  %live = call i1 @llvm.amdgcn.live.mask()
1157  %live.cond = select i1 %live, i32 0, i32 %count
  ; Control values 85 and 0 show up in the generated code above as
  ; quad_perm:[1,1,1,1] and quad_perm:[0,0,0,0]. Both i32 results are
  ; reinterpreted as float and subtracted.
1158  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
1159  %live.v0f = bitcast i32 %live.v0 to float
1160  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
1161  %live.v1f = bitcast i32 %live.v1 to float
1162  %v0 = fsub float %live.v0f, %live.v1f
1163  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  ; Skip the in-loop demote only when the lane is live AND the difference
  ; compares ordered-equal to 0.0; every other lane takes .demote1.
1164  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
1165  %cond2 = and i1 %live, %cond1
1166  br i1 %cond2, label %.continue1, label %.demote1
1167
1168.demote1:
  ; In-loop demote; control rejoins the loop at .continue1.
1169  call void @llvm.amdgcn.wqm.demote(i1 false)
1170  br label %.continue1
1171
1172.continue1:
  ; Advance the counter and loop again while %next < %limit (signed compare).
1173  %next = add i32 %count, 1
1174  %loop.cond = icmp slt i32 %next, %limit
1175  br i1 %loop.cond, label %.continue0, label %.return
1176
1177.return:
  ; Final compressed export with constant half operands (0xH3C00 is 1.0 in
  ; IEEE half precision); it appears as "exp mrt0 ... done compr vm" in the
  ; generated code above.
1178  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
1179  ret void
1180}
1181
; Intrinsic declarations used by the tests in this file. @wqm_deriv_loop calls
; wqm.demote, live.mask, wqm.f32, interp.p1, interp.p2, exp.compr.v2f16 and
; mov.dpp.i32. exp.f32 and image.sample.1d are not called by the functions at
; the end of the file; presumably they are used by earlier tests - verify
; before removing either declaration.
1182declare void @llvm.amdgcn.wqm.demote(i1) #0
1183declare i1 @llvm.amdgcn.live.mask() #0
1184declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
1185declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1186declare float @llvm.amdgcn.wqm.f32(float) #1
1187declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
1188declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
1189declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
1190declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
1191
; Attribute groups referenced by the declarations above. Groups #2 and #3 are
; also attached directly to the interp and exp.compr call sites, so keep the
; numbering stable when editing.
1192attributes #0 = { nounwind }
1193attributes #1 = { nounwind readnone }
1194attributes #2 = { nounwind readnone speculatable }
1195attributes #3 = { inaccessiblememonly nounwind }
1196attributes #4 = { convergent nounwind readnone }
1197