xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll (revision 40fa7f5e8b315159d45aa280c771af5998bdc75e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
6
; static_exact: pixel shader that calls llvm.amdgcn.wqm.demote with a constant
; false condition and then exports a value selected on %arg0 < 0.0.
; For every checked target the autogenerated assertions below expect exec to be
; cleared with s_andn2 (exec = exec & ~exec), followed by an scc0 branch to an
; early-exit block that issues a null export and s_endpgm.
7define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
8; SI-LABEL: static_exact:
9; SI:       ; %bb.0: ; %.entry
10; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
11; SI-NEXT:    s_andn2_b64 exec, exec, exec
12; SI-NEXT:    s_cbranch_scc0 .LBB0_2
13; SI-NEXT:  ; %bb.1: ; %.entry
14; SI-NEXT:    s_mov_b64 exec, 0
15; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
16; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
17; SI-NEXT:    s_endpgm
18; SI-NEXT:  .LBB0_2:
19; SI-NEXT:    s_mov_b64 exec, 0
20; SI-NEXT:    exp null off, off, off, off done vm
21; SI-NEXT:    s_endpgm
22;
23; GFX9-LABEL: static_exact:
24; GFX9:       ; %bb.0: ; %.entry
25; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
26; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
27; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
28; GFX9-NEXT:  ; %bb.1: ; %.entry
29; GFX9-NEXT:    s_mov_b64 exec, 0
30; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
31; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
32; GFX9-NEXT:    s_endpgm
33; GFX9-NEXT:  .LBB0_2:
34; GFX9-NEXT:    s_mov_b64 exec, 0
35; GFX9-NEXT:    exp null off, off, off, off done vm
36; GFX9-NEXT:    s_endpgm
37;
38; GFX10-32-LABEL: static_exact:
39; GFX10-32:       ; %bb.0: ; %.entry
40; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
41; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
42; GFX10-32-NEXT:    s_cbranch_scc0 .LBB0_2
43; GFX10-32-NEXT:  ; %bb.1: ; %.entry
44; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
45; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
46; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
47; GFX10-32-NEXT:    s_endpgm
48; GFX10-32-NEXT:  .LBB0_2:
49; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
50; GFX10-32-NEXT:    exp null off, off, off, off done vm
51; GFX10-32-NEXT:    s_endpgm
52;
53; GFX10-64-LABEL: static_exact:
54; GFX10-64:       ; %bb.0: ; %.entry
55; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
56; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
57; GFX10-64-NEXT:    s_cbranch_scc0 .LBB0_2
58; GFX10-64-NEXT:  ; %bb.1: ; %.entry
59; GFX10-64-NEXT:    s_mov_b64 exec, 0
60; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
61; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
62; GFX10-64-NEXT:    s_endpgm
63; GFX10-64-NEXT:  .LBB0_2:
64; GFX10-64-NEXT:    s_mov_b64 exec, 0
65; GFX10-64-NEXT:    exp null off, off, off, off done vm
66; GFX10-64-NEXT:    s_endpgm
67.entry:
68  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; %c1 is computed but never used in this function.
69  %c1 = fcmp oge float %arg1, 0.0
  ; Constant-false condition, so the demote does not depend on any input
  ; (presumably every active lane is demoted; confirm against the intrinsic's
  ; documentation).
70  call void @llvm.amdgcn.wqm.demote(i1 false)
71  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  ; The same selected value is exported in all four channels.
72  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
73  ret void
74}
75
; dynamic_exact: straight-line pixel shader in which the condition passed to
; llvm.amdgcn.wqm.demote is computed at run time (%arg1 >= 0.0, ordered).
; The assertions expect a copy of exec to be kept in SGPRs, reduced with
; s_andn2 by the lanes whose condition is false, and ANDed back into exec on
; the fall-through path; the scc0 path exits early through a null export.
76define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
77; SI-LABEL: dynamic_exact:
78; SI:       ; %bb.0: ; %.entry
79; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
80; SI-NEXT:    s_mov_b64 s[2:3], exec
81; SI-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
82; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
83; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
84; SI-NEXT:    s_cbranch_scc0 .LBB1_2
85; SI-NEXT:  ; %bb.1: ; %.entry
86; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
87; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
88; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
89; SI-NEXT:    s_endpgm
90; SI-NEXT:  .LBB1_2:
91; SI-NEXT:    s_mov_b64 exec, 0
92; SI-NEXT:    exp null off, off, off, off done vm
93; SI-NEXT:    s_endpgm
94;
95; GFX9-LABEL: dynamic_exact:
96; GFX9:       ; %bb.0: ; %.entry
97; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
98; GFX9-NEXT:    s_mov_b64 s[2:3], exec
99; GFX9-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
100; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
101; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
102; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
103; GFX9-NEXT:  ; %bb.1: ; %.entry
104; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
105; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
106; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
107; GFX9-NEXT:    s_endpgm
108; GFX9-NEXT:  .LBB1_2:
109; GFX9-NEXT:    s_mov_b64 exec, 0
110; GFX9-NEXT:    exp null off, off, off, off done vm
111; GFX9-NEXT:    s_endpgm
112;
113; GFX10-32-LABEL: dynamic_exact:
114; GFX10-32:       ; %bb.0: ; %.entry
115; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
116; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
117; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
118; GFX10-32-NEXT:    s_andn2_b32 s0, exec_lo, s0
119; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
120; GFX10-32-NEXT:    s_cbranch_scc0 .LBB1_2
121; GFX10-32-NEXT:  ; %bb.1: ; %.entry
122; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
123; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
124; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
125; GFX10-32-NEXT:    s_endpgm
126; GFX10-32-NEXT:  .LBB1_2:
127; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
128; GFX10-32-NEXT:    exp null off, off, off, off done vm
129; GFX10-32-NEXT:    s_endpgm
130;
131; GFX10-64-LABEL: dynamic_exact:
132; GFX10-64:       ; %bb.0: ; %.entry
133; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
134; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
135; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
136; GFX10-64-NEXT:    s_andn2_b64 s[0:1], exec, s[0:1]
137; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
138; GFX10-64-NEXT:    s_cbranch_scc0 .LBB1_2
139; GFX10-64-NEXT:  ; %bb.1: ; %.entry
140; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
141; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
142; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
143; GFX10-64-NEXT:    s_endpgm
144; GFX10-64-NEXT:  .LBB1_2:
145; GFX10-64-NEXT:    s_mov_b64 exec, 0
146; GFX10-64-NEXT:    exp null off, off, off, off done vm
147; GFX10-64-NEXT:    s_endpgm
148.entry:
  ; %c0 only feeds the exported value; %c1 is the demote condition.
149  %c0 = fcmp olt float %arg0, 0.000000e+00
150  %c1 = fcmp oge float %arg1, 0.0
  ; Per-lane condition: the checked code removes lanes where %c1 is false from
  ; the tracked mask.
151  call void @llvm.amdgcn.wqm.demote(i1 %c1)
152  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
153  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
154  ret void
155}
156
; branch: the demote call sits in its own basic block (.demote), reached only
; when bit 0 of (fptosi(%arg0) | fptosi(%arg1)) is set; both paths rejoin in
; .continue, which performs the export.
; The assertions expect exec-mask based control flow (s_and_saveexec, s_xor,
; s_or) around the demote block, with exec set to 0 inside that block and an
; early-exit null export when the tracked mask becomes empty.
157define amdgpu_ps void @branch(float %arg0, float %arg1) {
158; SI-LABEL: branch:
159; SI:       ; %bb.0: ; %.entry
160; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
161; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
162; SI-NEXT:    s_mov_b64 s[2:3], exec
163; SI-NEXT:    v_or_b32_e32 v0, v0, v1
164; SI-NEXT:    v_and_b32_e32 v0, 1, v0
165; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
166; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
167; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
168; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
169; SI-NEXT:    s_cbranch_execz .LBB2_3
170; SI-NEXT:  ; %bb.1: ; %.demote
171; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
172; SI-NEXT:    s_cbranch_scc0 .LBB2_4
173; SI-NEXT:  ; %bb.2: ; %.demote
174; SI-NEXT:    s_mov_b64 exec, 0
175; SI-NEXT:  .LBB2_3: ; %.continue
176; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
177; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
178; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
179; SI-NEXT:    s_endpgm
180; SI-NEXT:  .LBB2_4:
181; SI-NEXT:    s_mov_b64 exec, 0
182; SI-NEXT:    exp null off, off, off, off done vm
183; SI-NEXT:    s_endpgm
184;
185; GFX9-LABEL: branch:
186; GFX9:       ; %bb.0: ; %.entry
187; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
188; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
189; GFX9-NEXT:    s_mov_b64 s[2:3], exec
190; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
191; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
192; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
193; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
194; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
195; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
196; GFX9-NEXT:    s_cbranch_execz .LBB2_3
197; GFX9-NEXT:  ; %bb.1: ; %.demote
198; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
199; GFX9-NEXT:    s_cbranch_scc0 .LBB2_4
200; GFX9-NEXT:  ; %bb.2: ; %.demote
201; GFX9-NEXT:    s_mov_b64 exec, 0
202; GFX9-NEXT:  .LBB2_3: ; %.continue
203; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
204; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
205; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
206; GFX9-NEXT:    s_endpgm
207; GFX9-NEXT:  .LBB2_4:
208; GFX9-NEXT:    s_mov_b64 exec, 0
209; GFX9-NEXT:    exp null off, off, off, off done vm
210; GFX9-NEXT:    s_endpgm
211;
212; GFX10-32-LABEL: branch:
213; GFX10-32:       ; %bb.0: ; %.entry
214; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
215; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
216; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
217; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
218; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
219; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
220; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
221; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
222; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
223; GFX10-32-NEXT:    s_cbranch_execz .LBB2_3
224; GFX10-32-NEXT:  ; %bb.1: ; %.demote
225; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
226; GFX10-32-NEXT:    s_cbranch_scc0 .LBB2_4
227; GFX10-32-NEXT:  ; %bb.2: ; %.demote
228; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
229; GFX10-32-NEXT:  .LBB2_3: ; %.continue
230; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
231; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
232; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
233; GFX10-32-NEXT:    s_endpgm
234; GFX10-32-NEXT:  .LBB2_4:
235; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
236; GFX10-32-NEXT:    exp null off, off, off, off done vm
237; GFX10-32-NEXT:    s_endpgm
238;
239; GFX10-64-LABEL: branch:
240; GFX10-64:       ; %bb.0: ; %.entry
241; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
242; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
243; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
244; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
245; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
246; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
247; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
248; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
249; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
250; GFX10-64-NEXT:    s_cbranch_execz .LBB2_3
251; GFX10-64-NEXT:  ; %bb.1: ; %.demote
252; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
253; GFX10-64-NEXT:    s_cbranch_scc0 .LBB2_4
254; GFX10-64-NEXT:  ; %bb.2: ; %.demote
255; GFX10-64-NEXT:    s_mov_b64 exec, 0
256; GFX10-64-NEXT:  .LBB2_3: ; %.continue
257; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
258; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
259; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
260; GFX10-64-NEXT:    s_endpgm
261; GFX10-64-NEXT:  .LBB2_4:
262; GFX10-64-NEXT:    s_mov_b64 exec, 0
263; GFX10-64-NEXT:    exp null off, off, off, off done vm
264; GFX10-64-NEXT:    s_endpgm
265.entry:
266  %i0 = fptosi float %arg0 to i32
267  %i1 = fptosi float %arg1 to i32
268  %c0 = or i32 %i0, %i1
269  %c1 = and i32 %c0, 1
  ; %c2 is true when the low bit is clear; those lanes skip the demote block.
270  %c2 = icmp eq i32 %c1, 0
271  br i1 %c2, label %.continue, label %.demote
272
273.demote:
  ; Constant-false condition: the demote applies to whichever lanes reach this
  ; block.
274  call void @llvm.amdgcn.wqm.demote(i1 false)
275  br label %.continue
276
277.continue:
  ; Exports 1.0 for lanes that skipped the demote block and 0.0 otherwise.
278  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
279  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
280  ret void
281}
282
283
; wqm_demote_1: demote inside a conditional block (taken when %z is not
; ordered-less-than 0.0), followed in .continue by two image samples where the
; second sample's coordinate is derived from the first sample's result.
; The assertions expect the initial exec to be saved and exec rewritten with
; s_wqm at entry, the saved mask to be reduced in the demote block (with s_wqm
; of the reduced mask ANDed into exec), and exec to be ANDed with the saved
; mask just before the final image_sample.
284define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
285; SI-LABEL: wqm_demote_1:
286; SI:       ; %bb.0: ; %.entry
287; SI-NEXT:    s_mov_b64 s[12:13], exec
288; SI-NEXT:    s_wqm_b64 exec, exec
289; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
290; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
291; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
292; SI-NEXT:    s_cbranch_execz .LBB3_3
293; SI-NEXT:  ; %bb.1: ; %.demote
294; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
295; SI-NEXT:    s_cbranch_scc0 .LBB3_4
296; SI-NEXT:  ; %bb.2: ; %.demote
297; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
298; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
299; SI-NEXT:  .LBB3_3: ; %.continue
300; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
301; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
302; SI-NEXT:    s_waitcnt vmcnt(0)
303; SI-NEXT:    v_add_f32_e32 v0, v0, v0
304; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
305; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
306; SI-NEXT:    s_waitcnt vmcnt(0)
307; SI-NEXT:    s_branch .LBB3_5
308; SI-NEXT:  .LBB3_4:
309; SI-NEXT:    s_mov_b64 exec, 0
310; SI-NEXT:    exp null off, off, off, off done vm
311; SI-NEXT:    s_endpgm
312; SI-NEXT:  .LBB3_5:
313;
314; GFX9-LABEL: wqm_demote_1:
315; GFX9:       ; %bb.0: ; %.entry
316; GFX9-NEXT:    s_mov_b64 s[12:13], exec
317; GFX9-NEXT:    s_wqm_b64 exec, exec
318; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
319; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
320; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
321; GFX9-NEXT:    s_cbranch_execz .LBB3_3
322; GFX9-NEXT:  ; %bb.1: ; %.demote
323; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
324; GFX9-NEXT:    s_cbranch_scc0 .LBB3_4
325; GFX9-NEXT:  ; %bb.2: ; %.demote
326; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
327; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
328; GFX9-NEXT:  .LBB3_3: ; %.continue
329; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
330; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
331; GFX9-NEXT:    s_waitcnt vmcnt(0)
332; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
333; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
334; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
335; GFX9-NEXT:    s_waitcnt vmcnt(0)
336; GFX9-NEXT:    s_branch .LBB3_5
337; GFX9-NEXT:  .LBB3_4:
338; GFX9-NEXT:    s_mov_b64 exec, 0
339; GFX9-NEXT:    exp null off, off, off, off done vm
340; GFX9-NEXT:    s_endpgm
341; GFX9-NEXT:  .LBB3_5:
342;
343; GFX10-32-LABEL: wqm_demote_1:
344; GFX10-32:       ; %bb.0: ; %.entry
345; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
346; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
347; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
348; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
349; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
350; GFX10-32-NEXT:    s_cbranch_execz .LBB3_3
351; GFX10-32-NEXT:  ; %bb.1: ; %.demote
352; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
353; GFX10-32-NEXT:    s_cbranch_scc0 .LBB3_4
354; GFX10-32-NEXT:  ; %bb.2: ; %.demote
355; GFX10-32-NEXT:    s_wqm_b32 s14, s12
356; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
357; GFX10-32-NEXT:  .LBB3_3: ; %.continue
358; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
359; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
360; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
361; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
362; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
363; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
364; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
365; GFX10-32-NEXT:    s_branch .LBB3_5
366; GFX10-32-NEXT:  .LBB3_4:
367; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
368; GFX10-32-NEXT:    exp null off, off, off, off done vm
369; GFX10-32-NEXT:    s_endpgm
370; GFX10-32-NEXT:  .LBB3_5:
371;
372; GFX10-64-LABEL: wqm_demote_1:
373; GFX10-64:       ; %bb.0: ; %.entry
374; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
375; GFX10-64-NEXT:    s_wqm_b64 exec, exec
376; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
377; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
378; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
379; GFX10-64-NEXT:    s_cbranch_execz .LBB3_3
380; GFX10-64-NEXT:  ; %bb.1: ; %.demote
381; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
382; GFX10-64-NEXT:    s_cbranch_scc0 .LBB3_4
383; GFX10-64-NEXT:  ; %bb.2: ; %.demote
384; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
385; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
386; GFX10-64-NEXT:  .LBB3_3: ; %.continue
387; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
388; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
389; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
390; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
391; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
392; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
393; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
394; GFX10-64-NEXT:    s_branch .LBB3_5
395; GFX10-64-NEXT:  .LBB3_4:
396; GFX10-64-NEXT:    s_mov_b64 exec, 0
397; GFX10-64-NEXT:    exp null off, off, off, off done vm
398; GFX10-64-NEXT:    s_endpgm
399; GFX10-64-NEXT:  .LBB3_5:
400.entry:
  ; Lanes with %z < 0.0 go straight to .continue; all others (including
  ; unordered compares) pass through .demote first.
401  %z.cmp = fcmp olt float %z, 0.0
402  br i1 %z.cmp, label %.continue, label %.demote
403
404.demote:
405  call void @llvm.amdgcn.wqm.demote(i1 false)
406  br label %.continue
407
408.continue:
409  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0, so %coord1 is that element added to itself
  ; (the assertions show only dmask:0x1 being loaded for the first sample).
410  %tex0 = extractelement <4 x float> %tex, i32 0
411  %tex1 = extractelement <4 x float> %tex, i32 0
412  %coord1 = fadd float %tex0, %tex1
  ; Second sample depends on the first sample's result; its full result is
  ; the function's return value.
413  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
414
415  ret <4 x float> %rtex
416}
417
; wqm_demote_2: the first image sample is issued in the entry block and its
; element 0 decides whether the .demote block runs (taken when the value is
; not ordered-less-than 0.0); a second, dependent sample follows in .continue.
; The assertions expect the first image_sample and its s_waitcnt to precede
; the compare and exec-mask control flow, and exec to be ANDed with the saved
; mask before the final image_sample.
418define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
419; SI-LABEL: wqm_demote_2:
420; SI:       ; %bb.0: ; %.entry
421; SI-NEXT:    s_mov_b64 s[12:13], exec
422; SI-NEXT:    s_wqm_b64 exec, exec
423; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
424; SI-NEXT:    s_waitcnt vmcnt(0)
425; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
426; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
427; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
428; SI-NEXT:    s_cbranch_execz .LBB4_3
429; SI-NEXT:  ; %bb.1: ; %.demote
430; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
431; SI-NEXT:    s_cbranch_scc0 .LBB4_4
432; SI-NEXT:  ; %bb.2: ; %.demote
433; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
434; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
435; SI-NEXT:  .LBB4_3: ; %.continue
436; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
437; SI-NEXT:    v_add_f32_e32 v0, v0, v0
438; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
439; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
440; SI-NEXT:    s_waitcnt vmcnt(0)
441; SI-NEXT:    s_branch .LBB4_5
442; SI-NEXT:  .LBB4_4:
443; SI-NEXT:    s_mov_b64 exec, 0
444; SI-NEXT:    exp null off, off, off, off done vm
445; SI-NEXT:    s_endpgm
446; SI-NEXT:  .LBB4_5:
447;
448; GFX9-LABEL: wqm_demote_2:
449; GFX9:       ; %bb.0: ; %.entry
450; GFX9-NEXT:    s_mov_b64 s[12:13], exec
451; GFX9-NEXT:    s_wqm_b64 exec, exec
452; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
453; GFX9-NEXT:    s_waitcnt vmcnt(0)
454; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
455; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
456; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
457; GFX9-NEXT:    s_cbranch_execz .LBB4_3
458; GFX9-NEXT:  ; %bb.1: ; %.demote
459; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
460; GFX9-NEXT:    s_cbranch_scc0 .LBB4_4
461; GFX9-NEXT:  ; %bb.2: ; %.demote
462; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
463; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
464; GFX9-NEXT:  .LBB4_3: ; %.continue
465; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
466; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
467; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
468; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
469; GFX9-NEXT:    s_waitcnt vmcnt(0)
470; GFX9-NEXT:    s_branch .LBB4_5
471; GFX9-NEXT:  .LBB4_4:
472; GFX9-NEXT:    s_mov_b64 exec, 0
473; GFX9-NEXT:    exp null off, off, off, off done vm
474; GFX9-NEXT:    s_endpgm
475; GFX9-NEXT:  .LBB4_5:
476;
477; GFX10-32-LABEL: wqm_demote_2:
478; GFX10-32:       ; %bb.0: ; %.entry
479; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
480; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
481; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
482; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
483; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
484; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
485; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
486; GFX10-32-NEXT:    s_cbranch_execz .LBB4_3
487; GFX10-32-NEXT:  ; %bb.1: ; %.demote
488; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
489; GFX10-32-NEXT:    s_cbranch_scc0 .LBB4_4
490; GFX10-32-NEXT:  ; %bb.2: ; %.demote
491; GFX10-32-NEXT:    s_wqm_b32 s14, s12
492; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
493; GFX10-32-NEXT:  .LBB4_3: ; %.continue
494; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
495; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
496; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
497; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
498; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
499; GFX10-32-NEXT:    s_branch .LBB4_5
500; GFX10-32-NEXT:  .LBB4_4:
501; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
502; GFX10-32-NEXT:    exp null off, off, off, off done vm
503; GFX10-32-NEXT:    s_endpgm
504; GFX10-32-NEXT:  .LBB4_5:
505;
506; GFX10-64-LABEL: wqm_demote_2:
507; GFX10-64:       ; %bb.0: ; %.entry
508; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
509; GFX10-64-NEXT:    s_wqm_b64 exec, exec
510; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
511; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
512; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
513; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
514; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
515; GFX10-64-NEXT:    s_cbranch_execz .LBB4_3
516; GFX10-64-NEXT:  ; %bb.1: ; %.demote
517; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
518; GFX10-64-NEXT:    s_cbranch_scc0 .LBB4_4
519; GFX10-64-NEXT:  ; %bb.2: ; %.demote
520; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
521; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
522; GFX10-64-NEXT:  .LBB4_3: ; %.continue
523; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
524; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
525; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
526; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
527; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
528; GFX10-64-NEXT:    s_branch .LBB4_5
529; GFX10-64-NEXT:  .LBB4_4:
530; GFX10-64-NEXT:    s_mov_b64 exec, 0
531; GFX10-64-NEXT:    exp null off, off, off, off done vm
532; GFX10-64-NEXT:    s_endpgm
533; GFX10-64-NEXT:  .LBB4_5:
534.entry:
  ; First sample happens before any demote; its element 0 feeds both the
  ; branch condition and the second sample's coordinate.
535  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0.
536  %tex0 = extractelement <4 x float> %tex, i32 0
537  %tex1 = extractelement <4 x float> %tex, i32 0
538  %z.cmp = fcmp olt float %tex0, 0.0
539  br i1 %z.cmp, label %.continue, label %.demote
540
541.demote:
542  call void @llvm.amdgcn.wqm.demote(i1 false)
543  br label %.continue
544
545.continue:
546  %coord1 = fadd float %tex0, %tex1
547  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
548
549  ret <4 x float> %rtex
550}
551
; wqm_demote_dynamic: single-block variant; the comparison of the first
; sample's element 0 against 0.0 is passed directly as the demote condition
; (no IR-level branch), and a second, dependent sample follows.
; The assertions expect the saved mask to be reduced with two s_andn2
; instructions, s_wqm of the reduced mask to be ANDed into exec, and exec to be
; ANDed with the reduced mask itself before the final image_sample.
552define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
553; SI-LABEL: wqm_demote_dynamic:
554; SI:       ; %bb.0: ; %.entry
555; SI-NEXT:    s_mov_b64 s[12:13], exec
556; SI-NEXT:    s_wqm_b64 exec, exec
557; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
558; SI-NEXT:    s_waitcnt vmcnt(0)
559; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
560; SI-NEXT:    s_andn2_b64 s[14:15], exec, vcc
561; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
562; SI-NEXT:    s_cbranch_scc0 .LBB5_2
563; SI-NEXT:  ; %bb.1: ; %.entry
564; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
565; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
566; SI-NEXT:    v_add_f32_e32 v0, v0, v0
567; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
568; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
569; SI-NEXT:    s_waitcnt vmcnt(0)
570; SI-NEXT:    s_branch .LBB5_3
571; SI-NEXT:  .LBB5_2:
572; SI-NEXT:    s_mov_b64 exec, 0
573; SI-NEXT:    exp null off, off, off, off done vm
574; SI-NEXT:    s_endpgm
575; SI-NEXT:  .LBB5_3:
576;
577; GFX9-LABEL: wqm_demote_dynamic:
578; GFX9:       ; %bb.0: ; %.entry
579; GFX9-NEXT:    s_mov_b64 s[12:13], exec
580; GFX9-NEXT:    s_wqm_b64 exec, exec
581; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
582; GFX9-NEXT:    s_waitcnt vmcnt(0)
583; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
584; GFX9-NEXT:    s_andn2_b64 s[14:15], exec, vcc
585; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
586; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
587; GFX9-NEXT:  ; %bb.1: ; %.entry
588; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
589; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
590; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
591; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
592; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
593; GFX9-NEXT:    s_waitcnt vmcnt(0)
594; GFX9-NEXT:    s_branch .LBB5_3
595; GFX9-NEXT:  .LBB5_2:
596; GFX9-NEXT:    s_mov_b64 exec, 0
597; GFX9-NEXT:    exp null off, off, off, off done vm
598; GFX9-NEXT:    s_endpgm
599; GFX9-NEXT:  .LBB5_3:
600;
601; GFX10-32-LABEL: wqm_demote_dynamic:
602; GFX10-32:       ; %bb.0: ; %.entry
603; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
604; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
605; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
606; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
607; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
608; GFX10-32-NEXT:    s_andn2_b32 s13, exec_lo, vcc_lo
609; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
610; GFX10-32-NEXT:    s_cbranch_scc0 .LBB5_2
611; GFX10-32-NEXT:  ; %bb.1: ; %.entry
612; GFX10-32-NEXT:    s_wqm_b32 s13, s12
613; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
614; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
615; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
616; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
617; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
618; GFX10-32-NEXT:    s_branch .LBB5_3
619; GFX10-32-NEXT:  .LBB5_2:
620; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
621; GFX10-32-NEXT:    exp null off, off, off, off done vm
622; GFX10-32-NEXT:    s_endpgm
623; GFX10-32-NEXT:  .LBB5_3:
624;
625; GFX10-64-LABEL: wqm_demote_dynamic:
626; GFX10-64:       ; %bb.0: ; %.entry
627; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
628; GFX10-64-NEXT:    s_wqm_b64 exec, exec
629; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
630; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
631; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
632; GFX10-64-NEXT:    s_andn2_b64 s[14:15], exec, vcc
633; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
634; GFX10-64-NEXT:    s_cbranch_scc0 .LBB5_2
635; GFX10-64-NEXT:  ; %bb.1: ; %.entry
636; GFX10-64-NEXT:    s_wqm_b64 s[14:15], s[12:13]
637; GFX10-64-NEXT:    s_and_b64 exec, exec, s[14:15]
638; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
639; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
640; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
641; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
642; GFX10-64-NEXT:    s_branch .LBB5_3
643; GFX10-64-NEXT:  .LBB5_2:
644; GFX10-64-NEXT:    s_mov_b64 exec, 0
645; GFX10-64-NEXT:    exp null off, off, off, off done vm
646; GFX10-64-NEXT:    s_endpgm
647; GFX10-64-NEXT:  .LBB5_3:
648.entry:
649  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0.
650  %tex0 = extractelement <4 x float> %tex, i32 0
651  %tex1 = extractelement <4 x float> %tex, i32 0
  ; Per-lane demote condition taken from the sampled value; the checked code
  ; removes lanes where it is false from the saved mask.
652  %z.cmp = fcmp olt float %tex0, 0.0
653  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
654  %coord1 = fadd float %tex0, %tex1
655  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
656
657  ret <4 x float> %rtex
658}
659
660
661define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
662; SI-LABEL: wqm_deriv:
663; SI:       ; %bb.0: ; %.entry
664; SI-NEXT:    s_mov_b64 s[0:1], exec
665; SI-NEXT:    s_wqm_b64 exec, exec
666; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
667; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
668; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
669; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
670; SI-NEXT:    s_cbranch_execz .LBB6_3
671; SI-NEXT:  ; %bb.1: ; %.demote0
672; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
673; SI-NEXT:    s_cbranch_scc0 .LBB6_7
674; SI-NEXT:  ; %bb.2: ; %.demote0
675; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
676; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
677; SI-NEXT:  .LBB6_3: ; %.continue0
678; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
679; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
680; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
681; SI-NEXT:    v_mov_b32_e32 v1, v0
682; SI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
683; SI-NEXT:    s_nop 0
684; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
685; SI-NEXT:    s_nop 1
686; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
687; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
688; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
689; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
690; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
691; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
692; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
693; SI-NEXT:    s_cbranch_execz .LBB6_6
694; SI-NEXT:  ; %bb.4: ; %.demote1
695; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
696; SI-NEXT:    s_cbranch_scc0 .LBB6_7
697; SI-NEXT:  ; %bb.5: ; %.demote1
698; SI-NEXT:    s_mov_b64 exec, 0
699; SI-NEXT:  .LBB6_6: ; %.continue1
700; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
701; SI-NEXT:    v_bfrev_b32_e32 v0, 60
702; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
703; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
704; SI-NEXT:    s_endpgm
705; SI-NEXT:  .LBB6_7:
706; SI-NEXT:    s_mov_b64 exec, 0
707; SI-NEXT:    exp null off, off, off, off done vm
708; SI-NEXT:    s_endpgm
709;
710; GFX9-LABEL: wqm_deriv:
711; GFX9:       ; %bb.0: ; %.entry
712; GFX9-NEXT:    s_mov_b64 s[0:1], exec
713; GFX9-NEXT:    s_wqm_b64 exec, exec
714; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
715; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
716; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
717; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
718; GFX9-NEXT:    s_cbranch_execz .LBB6_3
719; GFX9-NEXT:  ; %bb.1: ; %.demote0
720; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
721; GFX9-NEXT:    s_cbranch_scc0 .LBB6_7
722; GFX9-NEXT:  ; %bb.2: ; %.demote0
723; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
724; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
725; GFX9-NEXT:  .LBB6_3: ; %.continue0
726; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
727; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
728; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
729; GFX9-NEXT:    v_mov_b32_e32 v1, v0
730; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
731; GFX9-NEXT:    s_nop 0
732; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
733; GFX9-NEXT:    s_nop 1
734; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
735; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
736; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
737; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
738; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
739; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
740; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
741; GFX9-NEXT:    s_cbranch_execz .LBB6_6
742; GFX9-NEXT:  ; %bb.4: ; %.demote1
743; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
744; GFX9-NEXT:    s_cbranch_scc0 .LBB6_7
745; GFX9-NEXT:  ; %bb.5: ; %.demote1
746; GFX9-NEXT:    s_mov_b64 exec, 0
747; GFX9-NEXT:  .LBB6_6: ; %.continue1
748; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
749; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
750; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
751; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
752; GFX9-NEXT:    s_endpgm
753; GFX9-NEXT:  .LBB6_7:
754; GFX9-NEXT:    s_mov_b64 exec, 0
755; GFX9-NEXT:    exp null off, off, off, off done vm
756; GFX9-NEXT:    s_endpgm
757;
758; GFX10-32-LABEL: wqm_deriv:
759; GFX10-32:       ; %bb.0: ; %.entry
760; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
761; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
762; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
763; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
764; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
765; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
766; GFX10-32-NEXT:    s_cbranch_execz .LBB6_3
767; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
768; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
769; GFX10-32-NEXT:    s_cbranch_scc0 .LBB6_7
770; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
771; GFX10-32-NEXT:    s_wqm_b32 s2, s0
772; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
773; GFX10-32-NEXT:  .LBB6_3: ; %.continue0
774; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
775; GFX10-32-NEXT:    s_mov_b32 s1, s0
776; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
777; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
778; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
779; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
780; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
781; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
782; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
783; GFX10-32-NEXT:    s_xor_b32 s1, s0, -1
784; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
785; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
786; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
787; GFX10-32-NEXT:    s_cbranch_execz .LBB6_6
788; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
789; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
790; GFX10-32-NEXT:    s_cbranch_scc0 .LBB6_7
791; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
792; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
793; GFX10-32-NEXT:  .LBB6_6: ; %.continue1
794; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
795; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
796; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
797; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
798; GFX10-32-NEXT:    s_endpgm
799; GFX10-32-NEXT:  .LBB6_7:
800; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
801; GFX10-32-NEXT:    exp null off, off, off, off done vm
802; GFX10-32-NEXT:    s_endpgm
803;
804; GFX10-64-LABEL: wqm_deriv:
805; GFX10-64:       ; %bb.0: ; %.entry
806; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
807; GFX10-64-NEXT:    s_wqm_b64 exec, exec
808; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
809; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
810; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
811; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
812; GFX10-64-NEXT:    s_cbranch_execz .LBB6_3
813; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
814; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
815; GFX10-64-NEXT:    s_cbranch_scc0 .LBB6_7
816; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
817; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
818; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
819; GFX10-64-NEXT:  .LBB6_3: ; %.continue0
820; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
821; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
822; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
823; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
824; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
825; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
826; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
827; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
828; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
829; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
830; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
831; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
832; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
833; GFX10-64-NEXT:    s_cbranch_execz .LBB6_6
834; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
835; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
836; GFX10-64-NEXT:    s_cbranch_scc0 .LBB6_7
837; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
838; GFX10-64-NEXT:    s_mov_b64 exec, 0
839; GFX10-64-NEXT:  .LBB6_6: ; %.continue1
840; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
841; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
842; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
843; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
844; GFX10-64-NEXT:    s_endpgm
845; GFX10-64-NEXT:  .LBB6_7:
846; GFX10-64-NEXT:    s_mov_b64 exec, 0
847; GFX10-64-NEXT:    exp null off, off, off, off done vm
848; GFX10-64-NEXT:    s_endpgm
849.entry:
850  %p0 = extractelement <2 x float> %input, i32 0
851  %p1 = extractelement <2 x float> %input, i32 1
852  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
853  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
854  %argi = fptosi float %arg to i32
855  %cond0 = icmp eq i32 %argi, 0
856  br i1 %cond0, label %.continue0, label %.demote0
857
858.demote0:
859  call void @llvm.amdgcn.wqm.demote(i1 false)
860  br label %.continue0
861
862.continue0:
863  %live = call i1 @llvm.amdgcn.live.mask()
864  %live.cond = select i1 %live, i32 0, i32 1065353216
865  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
866  %live.v0f = bitcast i32 %live.v0 to float
867  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
868  %live.v1f = bitcast i32 %live.v1 to float
869  %v0 = fsub float %live.v0f, %live.v1f
870  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
871  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
872  %cond2 = and i1 %live, %cond1
873  br i1 %cond2, label %.continue1, label %.demote1
874
875.demote1:
876  call void @llvm.amdgcn.wqm.demote(i1 false)
877  br label %.continue1
878
879.continue1:
880  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
881  ret void
882}
883
; wqm_deriv_loop: pixel-shader (amdgpu_ps) test that calls
; llvm.amdgcn.wqm.demote once before a counted loop and once inside it.
;   %input - <2 x float>; its two elements feed the interp.p1/interp.p2 calls.
;   %arg   - converted to i32; any non-zero result takes the .demote0 path.
;   %index - last operand of the interp.p1/interp.p2 calls.
;   %limit - signed loop bound; the loop repeats while %count + 1 < %limit.
; The expected assembly that follows is autogenerated (see the first line of
; this file) for the tonga, gfx900 and gfx1010 wave32/wave64 llc invocations;
; regenerate it with utils/update_llc_test_checks.py instead of hand-editing.
884define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
885; SI-LABEL: wqm_deriv_loop:
886; SI:       ; %bb.0: ; %.entry
887; SI-NEXT:    s_mov_b64 s[0:1], exec
888; SI-NEXT:    s_wqm_b64 exec, exec
889; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
890; SI-NEXT:    s_mov_b32 s6, 0
891; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
892; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
893; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
894; SI-NEXT:    s_cbranch_execz .LBB7_3
895; SI-NEXT:  ; %bb.1: ; %.demote0
896; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
897; SI-NEXT:    s_cbranch_scc0 .LBB7_9
898; SI-NEXT:  ; %bb.2: ; %.demote0
899; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
900; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
901; SI-NEXT:  .LBB7_3: ; %.continue0.preheader
902; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
903; SI-NEXT:    s_mov_b64 s[2:3], 0
904; SI-NEXT:    s_branch .LBB7_5
905; SI-NEXT:  .LBB7_4: ; %.continue1
906; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
907; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
908; SI-NEXT:    s_add_i32 s6, s6, 1
909; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
910; SI-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
911; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
912; SI-NEXT:    s_cbranch_execz .LBB7_8
913; SI-NEXT:  .LBB7_5: ; %.continue0
914; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
915; SI-NEXT:    v_mov_b32_e32 v0, s6
916; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
917; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
918; SI-NEXT:    v_mov_b32_e32 v2, v0
919; SI-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
920; SI-NEXT:    s_nop 0
921; SI-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
922; SI-NEXT:    s_nop 1
923; SI-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
924; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
925; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
926; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
927; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
928; SI-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
929; SI-NEXT:    s_cbranch_execz .LBB7_4
930; SI-NEXT:  ; %bb.6: ; %.demote1
931; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
932; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
933; SI-NEXT:    s_cbranch_scc0 .LBB7_9
934; SI-NEXT:  ; %bb.7: ; %.demote1
935; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
936; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
937; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
938; SI-NEXT:    s_branch .LBB7_4
939; SI-NEXT:  .LBB7_8: ; %.return
940; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
941; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
942; SI-NEXT:    v_bfrev_b32_e32 v0, 60
943; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
944; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
945; SI-NEXT:    s_endpgm
946; SI-NEXT:  .LBB7_9:
947; SI-NEXT:    s_mov_b64 exec, 0
948; SI-NEXT:    exp null off, off, off, off done vm
949; SI-NEXT:    s_endpgm
950;
951; GFX9-LABEL: wqm_deriv_loop:
952; GFX9:       ; %bb.0: ; %.entry
953; GFX9-NEXT:    s_mov_b64 s[0:1], exec
954; GFX9-NEXT:    s_wqm_b64 exec, exec
955; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
956; GFX9-NEXT:    s_mov_b32 s6, 0
957; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
958; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
959; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
960; GFX9-NEXT:    s_cbranch_execz .LBB7_3
961; GFX9-NEXT:  ; %bb.1: ; %.demote0
962; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
963; GFX9-NEXT:    s_cbranch_scc0 .LBB7_9
964; GFX9-NEXT:  ; %bb.2: ; %.demote0
965; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
966; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
967; GFX9-NEXT:  .LBB7_3: ; %.continue0.preheader
968; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
969; GFX9-NEXT:    s_mov_b64 s[2:3], 0
970; GFX9-NEXT:    s_branch .LBB7_5
971; GFX9-NEXT:  .LBB7_4: ; %.continue1
972; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
973; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
974; GFX9-NEXT:    s_add_i32 s6, s6, 1
975; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
976; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
977; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
978; GFX9-NEXT:    s_cbranch_execz .LBB7_8
979; GFX9-NEXT:  .LBB7_5: ; %.continue0
980; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
981; GFX9-NEXT:    v_mov_b32_e32 v0, s6
982; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
983; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
984; GFX9-NEXT:    v_mov_b32_e32 v2, v0
985; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
986; GFX9-NEXT:    s_nop 0
987; GFX9-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
988; GFX9-NEXT:    s_nop 1
989; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
990; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
991; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
992; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
993; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
994; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
995; GFX9-NEXT:    s_cbranch_execz .LBB7_4
996; GFX9-NEXT:  ; %bb.6: ; %.demote1
997; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
998; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
999; GFX9-NEXT:    s_cbranch_scc0 .LBB7_9
1000; GFX9-NEXT:  ; %bb.7: ; %.demote1
1001; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1002; GFX9-NEXT:    s_wqm_b64 s[8:9], s[0:1]
1003; GFX9-NEXT:    s_and_b64 exec, exec, s[8:9]
1004; GFX9-NEXT:    s_branch .LBB7_4
1005; GFX9-NEXT:  .LBB7_8: ; %.return
1006; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
1007; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
1008; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
1009; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
1010; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1011; GFX9-NEXT:    s_endpgm
1012; GFX9-NEXT:  .LBB7_9:
1013; GFX9-NEXT:    s_mov_b64 exec, 0
1014; GFX9-NEXT:    exp null off, off, off, off done vm
1015; GFX9-NEXT:    s_endpgm
1016;
1017; GFX10-32-LABEL: wqm_deriv_loop:
1018; GFX10-32:       ; %bb.0: ; %.entry
1019; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
1020; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1021; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
1022; GFX10-32-NEXT:    s_mov_b32 s1, 0
1023; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1024; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1025; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
1026; GFX10-32-NEXT:    s_cbranch_execz .LBB7_3
1027; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
1028; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
1029; GFX10-32-NEXT:    s_cbranch_scc0 .LBB7_9
1030; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
1031; GFX10-32-NEXT:    s_wqm_b32 s3, s0
1032; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
1033; GFX10-32-NEXT:  .LBB7_3: ; %.continue0.preheader
1034; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1035; GFX10-32-NEXT:    s_mov_b32 s2, 0
1036; GFX10-32-NEXT:    s_branch .LBB7_5
1037; GFX10-32-NEXT:  .LBB7_4: ; %.continue1
1038; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1039; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1040; GFX10-32-NEXT:    s_add_i32 s2, s2, 1
1041; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s2, v1
1042; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
1043; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
1044; GFX10-32-NEXT:    s_cbranch_execz .LBB7_8
1045; GFX10-32-NEXT:  .LBB7_5: ; %.continue0
1046; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
1047; GFX10-32-NEXT:    s_mov_b32 s3, s0
1048; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s3
1049; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
1050; GFX10-32-NEXT:    v_mov_b32_e32 v2, v0
1051; GFX10-32-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
1052; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
1053; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1054; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
1055; GFX10-32-NEXT:    s_or_b32 s3, s3, vcc_lo
1056; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s3
1057; GFX10-32-NEXT:    s_xor_b32 s3, exec_lo, s4
1058; GFX10-32-NEXT:    s_cbranch_execz .LBB7_4
1059; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
1060; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1061; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
1062; GFX10-32-NEXT:    s_cbranch_scc0 .LBB7_9
1063; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
1064; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1065; GFX10-32-NEXT:    s_wqm_b32 s4, s0
1066; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s4
1067; GFX10-32-NEXT:    s_branch .LBB7_4
1068; GFX10-32-NEXT:  .LBB7_8: ; %.return
1069; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1070; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1071; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
1072; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
1073; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1074; GFX10-32-NEXT:    s_endpgm
1075; GFX10-32-NEXT:  .LBB7_9:
1076; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
1077; GFX10-32-NEXT:    exp null off, off, off, off done vm
1078; GFX10-32-NEXT:    s_endpgm
1079;
1080; GFX10-64-LABEL: wqm_deriv_loop:
1081; GFX10-64:       ; %bb.0: ; %.entry
1082; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
1083; GFX10-64-NEXT:    s_wqm_b64 exec, exec
1084; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
1085; GFX10-64-NEXT:    s_mov_b32 s6, 0
1086; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1087; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1088; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
1089; GFX10-64-NEXT:    s_cbranch_execz .LBB7_3
1090; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
1091; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1092; GFX10-64-NEXT:    s_cbranch_scc0 .LBB7_9
1093; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
1094; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
1095; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
1096; GFX10-64-NEXT:  .LBB7_3: ; %.continue0.preheader
1097; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
1098; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
1099; GFX10-64-NEXT:    s_branch .LBB7_5
1100; GFX10-64-NEXT:  .LBB7_4: ; %.continue1
1101; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1102; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
1103; GFX10-64-NEXT:    s_add_i32 s6, s6, 1
1104; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
1105; GFX10-64-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
1106; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[2:3]
1107; GFX10-64-NEXT:    s_cbranch_execz .LBB7_8
1108; GFX10-64-NEXT:  .LBB7_5: ; %.continue0
1109; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
1110; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
1111; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s6, 0, s[4:5]
1112; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
1113; GFX10-64-NEXT:    v_mov_b32_e32 v2, v0
1114; GFX10-64-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
1115; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
1116; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1117; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
1118; GFX10-64-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
1119; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
1120; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
1121; GFX10-64-NEXT:    s_cbranch_execz .LBB7_4
1122; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
1123; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1124; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1125; GFX10-64-NEXT:    s_cbranch_scc0 .LBB7_9
1126; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
1127; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1128; GFX10-64-NEXT:    s_wqm_b64 s[8:9], s[0:1]
1129; GFX10-64-NEXT:    s_and_b64 exec, exec, s[8:9]
1130; GFX10-64-NEXT:    s_branch .LBB7_4
1131; GFX10-64-NEXT:  .LBB7_8: ; %.return
1132; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
1133; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
1134; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
1135; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
1136; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1137; GFX10-64-NEXT:    s_endpgm
1138; GFX10-64-NEXT:  .LBB7_9:
1139; GFX10-64-NEXT:    s_mov_b64 exec, 0
1140; GFX10-64-NEXT:    exp null off, off, off, off done vm
1141; GFX10-64-NEXT:    s_endpgm
1142.entry:
1143  %p0 = extractelement <2 x float> %input, i32 0
1144  %p1 = extractelement <2 x float> %input, i32 1
; %x0 feeds only %x1, and %x1 has no users in this function; the expected
; assembly above contains no interp instruction.
1145  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
1146  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
; Lanes whose %arg converts to a non-zero integer go through .demote0.
1147  %argi = fptosi float %arg to i32
1148  %cond0 = icmp eq i32 %argi, 0
1149  br i1 %cond0, label %.continue0, label %.demote0
1150
1151.demote0:
; Demote with a constant-false condition, before entering the loop.
1152  call void @llvm.amdgcn.wqm.demote(i1 false)
1153  br label %.continue0
1154
1155.continue0:
; Loop header: %count is 0 on entry (from either predecessor) and %next on
; the back edge from .continue1.
1156  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
; Lanes reported live by live.mask select 0; all other lanes select %count.
1157  %live = call i1 @llvm.amdgcn.live.mask()
1158  %live.cond = select i1 %live, i32 0, i32 %count
; The dpp controls 85 and 0 appear in the expected assembly as
; quad_perm:[1,1,1,1] and quad_perm:[0,0,0,0] respectively.
1159  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
1160  %live.v0f = bitcast i32 %live.v0 to float
1161  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
1162  %live.v1f = bitcast i32 %live.v1 to float
; Difference of the two swizzled values, routed through the wqm.f32 intrinsic.
1163  %v0 = fsub float %live.v0f, %live.v1f
1164  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
; Skip .demote1 only when the lane is live AND the difference is exactly 0.0.
1165  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
1166  %cond2 = and i1 %live, %cond1
1167  br i1 %cond2, label %.continue1, label %.demote1
1168
1169.demote1:
; Second constant-false demote, this time inside the loop body.
1170  call void @llvm.amdgcn.wqm.demote(i1 false)
1171  br label %.continue1
1172
1173.continue1:
; Loop latch: advance the counter and repeat while %next < %limit (signed).
1174  %next = add i32 %count, 1
1175  %loop.cond = icmp slt i32 %next, %limit
1176  br i1 %loop.cond, label %.continue0, label %.return
1177
1178.return:
; Compressed export of constant half-precision values, then return.
1179  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
1180  ret void
1181}
1182
; static_exact_nop: pixel-shader test that calls llvm.amdgcn.wqm.demote with a
; constant-true condition. The expected output for every target below is just
; a compare, a select (v_cndmask), the export and s_endpgm; it contains no
; exec-mask update and no early-exit block, unlike the expected output for
; static_exact at the top of this file.
;   %arg0 - compared against 0.0; selects the exported value (1.0 or 0.0).
;   %arg1 - used only by %c1, which itself has no users.
; The expected assembly is autogenerated; regenerate it with
; utils/update_llc_test_checks.py instead of hand-editing.
1183define amdgpu_ps void @static_exact_nop(float %arg0, float %arg1) {
1184; SI-LABEL: static_exact_nop:
1185; SI:       ; %bb.0: ; %.entry
1186; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
1187; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
1188; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1189; SI-NEXT:    s_endpgm
1190;
1191; GFX9-LABEL: static_exact_nop:
1192; GFX9:       ; %bb.0: ; %.entry
1193; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
1194; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
1195; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1196; GFX9-NEXT:    s_endpgm
1197;
1198; GFX10-32-LABEL: static_exact_nop:
1199; GFX10-32:       ; %bb.0: ; %.entry
1200; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
1201; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
1202; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1203; GFX10-32-NEXT:    s_endpgm
1204;
1205; GFX10-64-LABEL: wqm_deriv_loop_placeholder_do_not_use
1219
1220
; Declarations of the AMDGPU intrinsics called by the test functions.
1221declare void @llvm.amdgcn.wqm.demote(i1) #0
1222declare i1 @llvm.amdgcn.live.mask() #0
1223declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
; NOTE(review): image.sample.1d is not called by wqm_deriv_loop or
; static_exact_nop; presumably it is used by tests earlier in the file.
1224declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1225declare float @llvm.amdgcn.wqm.f32(float) #1
1226declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
1227declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
1228declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
1229declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
1230
; Attribute groups referenced by the declarations and call sites (#0 - #4).
1231attributes #0 = { nounwind }
1232attributes #1 = { nounwind readnone }
1233attributes #2 = { nounwind readnone speculatable }
1234attributes #3 = { inaccessiblememonly nounwind }
1235attributes #4 = { convergent nounwind readnone }
1236