xref: /llvm-project/llvm/test/CodeGen/AMDGPU/v_cndmask.ll (revision 8a0c2e75678a4d1d479676217db622d1981c18d3)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x() #1
9declare half @llvm.fabs.f16(half)
10declare float @llvm.fabs.f32(float)
11declare double @llvm.fabs.f64(double)
12
13; All nan values are converted to 0xffffffff
; Selects a NaN constant when %c != 0, otherwise the float loaded from
; %fptr[workitem.id.x], and stores the result to %out.
; Every target below is expected to emit a v_cndmask_b32_e32 whose constant
; operand is the inline immediate -1, with the loaded value in a VGPR and the
; condition in vcc.
14define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
15; SI-LABEL: v_cnd_nan_nosgpr:
16; SI:       ; %bb.0:
17; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
18; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
19; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
20; SI-NEXT:    s_mov_b32 s3, 0xf000
21; SI-NEXT:    s_mov_b32 s6, 0
22; SI-NEXT:    s_mov_b32 s7, s3
23; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
24; SI-NEXT:    v_mov_b32_e32 v1, 0
25; SI-NEXT:    s_waitcnt lgkmcnt(0)
26; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
27; SI-NEXT:    s_mov_b32 s2, -1
28; SI-NEXT:    s_cmp_eq_u32 s8, 0
29; SI-NEXT:    s_cselect_b64 vcc, -1, 0
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
32; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
33; SI-NEXT:    s_endpgm
34;
35; VI-LABEL: v_cnd_nan_nosgpr:
36; VI:       ; %bb.0:
37; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
38; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
39; VI-NEXT:    s_waitcnt lgkmcnt(0)
40; VI-NEXT:    v_mov_b32_e32 v1, s1
41; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
42; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
43; VI-NEXT:    flat_load_dword v0, v[0:1]
44; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
45; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
46; VI-NEXT:    s_waitcnt lgkmcnt(0)
47; VI-NEXT:    s_cmp_eq_u32 s2, 0
48; VI-NEXT:    s_cselect_b64 vcc, -1, 0
49; VI-NEXT:    s_waitcnt vmcnt(0)
50; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
51; VI-NEXT:    v_mov_b32_e32 v0, s0
52; VI-NEXT:    v_mov_b32_e32 v1, s1
53; VI-NEXT:    flat_store_dword v[0:1], v2
54; VI-NEXT:    s_endpgm
55;
56; GFX10-LABEL: v_cnd_nan_nosgpr:
57; GFX10:       ; %bb.0:
58; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
59; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
60; GFX10-NEXT:    v_mov_b32_e32 v1, 0
61; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
63; GFX10-NEXT:    s_clause 0x1
64; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
65; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
66; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
67; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
69; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
70; GFX10-NEXT:    s_waitcnt vmcnt(0)
71; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
72; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
73; GFX10-NEXT:    s_endpgm
74;
75; GFX11-LABEL: v_cnd_nan_nosgpr:
76; GFX11:       ; %bb.0:
77; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
78; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
79; GFX11-NEXT:    v_mov_b32_e32 v1, 0
80; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
81; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
82; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
84; GFX11-NEXT:    s_clause 0x1
85; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
86; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
87; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX11-NEXT:    s_cmp_eq_u32 s2, 0
89; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
90; GFX11-NEXT:    s_waitcnt vmcnt(0)
91; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
92; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
93; GFX11-NEXT:    s_endpgm
94;
95; GFX12-LABEL: v_cnd_nan_nosgpr:
96; GFX12:       ; %bb.0:
97; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
98; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
99; GFX12-NEXT:    v_mov_b32_e32 v1, 0
100; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
101; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
102; GFX12-NEXT:    s_wait_kmcnt 0x0
103; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1]
104; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
105; GFX12-NEXT:    s_wait_kmcnt 0x0
106; GFX12-NEXT:    s_cmp_eq_u32 s2, 0
107; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
108; GFX12-NEXT:    s_wait_loadcnt 0x0
109; GFX12-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
110; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
111; GFX12-NEXT:    s_endpgm
  ; %f comes from memory, so it is the VGPR operand of the select.
112  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
113  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
114  %f = load float, ptr addrspace(1) %f.gep
  ; The NaN literal is the select's "true" operand; the checks above expect it
  ; to be materialized as the inline constant -1.
115  %setcc = icmp ne i32 %c, 0
116  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
117  store float %select, ptr addrspace(1) %out
118  ret void
119}
120
121; This requires slightly trickier SGPR operand legalization since the
122; single constant bus SGPR usage is the last operand, and it should
123; never be moved.
124; However, on GFX10 the constant bus limit is two scalar operands, not one.
125; All nan values are converted to 0xffffffff
; Selects a NaN constant when %c != 0, otherwise the kernel argument %f, and
; stores the result to %out. Unlike the load-based variant, %f is read from an
; SGPR (s3 in the checks below).
; Expected lowering: SI and VI copy s3 into a VGPR and use the e32 form with
; vcc; GFX10 and GFX11 read s3 directly in v_cndmask_b32_e64 next to the
; SGPR-pair condition; GFX12 performs the select with s_cselect_b32.
126define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
127; SI-LABEL: v_cnd_nan:
128; SI:       ; %bb.0:
129; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
130; SI-NEXT:    s_mov_b32 s7, 0xf000
131; SI-NEXT:    s_mov_b32 s6, -1
132; SI-NEXT:    s_waitcnt lgkmcnt(0)
133; SI-NEXT:    s_mov_b32 s4, s0
134; SI-NEXT:    s_mov_b32 s5, s1
135; SI-NEXT:    s_cmp_eq_u32 s2, 0
136; SI-NEXT:    v_mov_b32_e32 v0, s3
137; SI-NEXT:    s_cselect_b64 vcc, -1, 0
138; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
139; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
140; SI-NEXT:    s_endpgm
141;
142; VI-LABEL: v_cnd_nan:
143; VI:       ; %bb.0:
144; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
145; VI-NEXT:    s_waitcnt lgkmcnt(0)
146; VI-NEXT:    s_cmp_eq_u32 s2, 0
147; VI-NEXT:    v_mov_b32_e32 v0, s3
148; VI-NEXT:    s_cselect_b64 vcc, -1, 0
149; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
150; VI-NEXT:    v_mov_b32_e32 v0, s0
151; VI-NEXT:    v_mov_b32_e32 v1, s1
152; VI-NEXT:    flat_store_dword v[0:1], v2
153; VI-NEXT:    s_endpgm
154;
155; GFX10-LABEL: v_cnd_nan:
156; GFX10:       ; %bb.0:
157; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
158; GFX10-NEXT:    v_mov_b32_e32 v0, 0
159; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX10-NEXT:    s_cmp_eq_u32 s2, 0
161; GFX10-NEXT:    s_cselect_b64 s[4:5], -1, 0
162; GFX10-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
163; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
164; GFX10-NEXT:    s_endpgm
165;
166; GFX11-LABEL: v_cnd_nan:
167; GFX11:       ; %bb.0:
168; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
169; GFX11-NEXT:    v_mov_b32_e32 v0, 0
170; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX11-NEXT:    s_cmp_eq_u32 s2, 0
172; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
173; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
174; GFX11-NEXT:    v_cndmask_b32_e64 v1, -1, s3, s[4:5]
175; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
176; GFX11-NEXT:    s_endpgm
177;
178; GFX12-LABEL: v_cnd_nan:
179; GFX12:       ; %bb.0:
180; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
181; GFX12-NEXT:    v_mov_b32_e32 v0, 0
182; GFX12-NEXT:    s_wait_kmcnt 0x0
183; GFX12-NEXT:    s_cmp_eq_u32 s2, 0
184; GFX12-NEXT:    s_cselect_b32 s2, s3, -1
185; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
186; GFX12-NEXT:    v_mov_b32_e32 v1, s2
187; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
188; GFX12-NEXT:    s_endpgm
  ; The NaN literal is the select's "true" operand; the checks above expect it
  ; to be materialized as the inline constant -1.
189  %setcc = icmp ne i32 %c, 0
190  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
191  store float %select, ptr addrspace(1) %out
192  ret void
193}
194
195; Test different compare and select operand types for optimal code
196; shrinking.
197; (select (cmp (sgprX, constant)), constant, sgprZ)
; select (fcmp one %x, 0.0), 1.0, %z with %x and %z both kernel arguments;
; the result is stored to %out[workitem.id.x].
; Expected lowering: the compare is matched as nlg against the inline constant
; 0, and 1.0 is an inline operand of the select. SI and VI move %z into a VGPR
; first, GFX10 and GFX11 use the SGPR directly in the e64 form, and GFX12 uses
; s_cmp_nlg_f32 followed by s_cselect_b32.
; NOTE(review): the unnamed [8 x i32] argument sits between %out and %x, so
; %x/%z are loaded from a separate kernarg offset (0x13 / 0x4c in the checks);
; the reason for the padding is not stated here -- confirm before changing it.
198define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
199; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
200; SI:       ; %bb.0:
201; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
202; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x13
203; SI-NEXT:    s_mov_b32 s3, 0xf000
204; SI-NEXT:    s_mov_b32 s2, 0
205; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
206; SI-NEXT:    v_mov_b32_e32 v1, 0
207; SI-NEXT:    s_waitcnt lgkmcnt(0)
208; SI-NEXT:    v_mov_b32_e32 v2, s5
209; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
210; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
211; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
212; SI-NEXT:    s_endpgm
213;
214; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
215; VI:       ; %bb.0:
216; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
217; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x4c
218; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
219; VI-NEXT:    s_waitcnt lgkmcnt(0)
220; VI-NEXT:    v_mov_b32_e32 v1, s1
221; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
222; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
223; VI-NEXT:    v_mov_b32_e32 v2, s3
224; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
225; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
226; VI-NEXT:    flat_store_dword v[0:1], v2
227; VI-NEXT:    s_endpgm
228;
229; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
230; GFX10:       ; %bb.0:
231; GFX10-NEXT:    s_clause 0x1
232; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
233; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
234; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
235; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
237; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
238; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
239; GFX10-NEXT:    s_endpgm
240;
241; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
242; GFX11:       ; %bb.0:
243; GFX11-NEXT:    s_clause 0x1
244; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
245; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
246; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
247; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
248; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
249; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
251; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s1, s[4:5]
252; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
253; GFX11-NEXT:    s_endpgm
254;
255; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
256; GFX12:       ; %bb.0:
257; GFX12-NEXT:    s_clause 0x1
258; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
259; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
260; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
261; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
262; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
263; GFX12-NEXT:    s_wait_kmcnt 0x0
264; GFX12-NEXT:    s_cmp_nlg_f32 s0, 0
265; GFX12-NEXT:    s_cselect_b32 s0, s1, 1.0
266; GFX12-NEXT:    v_mov_b32_e32 v1, s0
267; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
268; GFX12-NEXT:    s_endpgm
  ; Per-workitem output slot: %out + workitem.id.x (sign-extended to i64).
269  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
270  %tid.ext = sext i32 %tid to i64
271  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Compare constant is 0.0, select constant is 1.0.
272  %setcc = fcmp one float %x, 0.0
273  %select = select i1 %setcc, float 1.0, float %z
274  store float %select, ptr addrspace(1) %out.gep
275  ret void
276}
277
; select (fcmp one %x, 0.0), 1.0, %x -- the compared kernel argument is also
; the select's false operand; the result is stored to %out[workitem.id.x].
; Expected lowering: the same SGPR is read for the compare and for the selected
; value on every target. SI and VI additionally copy it into a VGPR for the
; e32 v_cndmask, GFX10 and GFX11 use it directly in the e64 form, and GFX12
; uses s_cmp_nlg_f32 followed by s_cselect_b32.
278define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
279; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
280; SI:       ; %bb.0:
281; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
282; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
283; SI-NEXT:    s_mov_b32 s3, 0xf000
284; SI-NEXT:    s_mov_b32 s2, 0
285; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
286; SI-NEXT:    v_mov_b32_e32 v1, 0
287; SI-NEXT:    s_waitcnt lgkmcnt(0)
288; SI-NEXT:    v_mov_b32_e32 v2, s4
289; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
290; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
291; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
292; SI-NEXT:    s_endpgm
293;
294; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
295; VI:       ; %bb.0:
296; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
297; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
298; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
299; VI-NEXT:    s_waitcnt lgkmcnt(0)
300; VI-NEXT:    v_mov_b32_e32 v1, s1
301; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
302; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
303; VI-NEXT:    v_mov_b32_e32 v2, s2
304; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
305; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
306; VI-NEXT:    flat_store_dword v[0:1], v2
307; VI-NEXT:    s_endpgm
308;
309; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
310; GFX10:       ; %bb.0:
311; GFX10-NEXT:    s_clause 0x1
312; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x2c
313; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
314; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
315; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
316; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s6, 0
317; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, s6, s[2:3]
318; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
319; GFX10-NEXT:    s_endpgm
320;
321; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
322; GFX11:       ; %bb.0:
323; GFX11-NEXT:    s_clause 0x1
324; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
325; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
326; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
327; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
328; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
329; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s6, 0
331; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s6, s[2:3]
332; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
333; GFX11-NEXT:    s_endpgm
334;
335; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
336; GFX12:       ; %bb.0:
337; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
338; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
339; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
340; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
341; GFX12-NEXT:    s_wait_kmcnt 0x0
342; GFX12-NEXT:    s_cmp_nlg_f32 s2, 0
343; GFX12-NEXT:    s_cselect_b32 s2, s2, 1.0
344; GFX12-NEXT:    v_mov_b32_e32 v1, s2
345; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
346; GFX12-NEXT:    s_endpgm
  ; Per-workitem output slot: %out + workitem.id.x (sign-extended to i64).
347  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
348  %tid.ext = sext i32 %tid to i64
349  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; %x is both the compared value and the select's false operand.
350  %setcc = fcmp one float %x, 0.0
351  %select = select i1 %setcc, float 1.0, float %x
352  store float %select, ptr addrspace(1) %out.gep
353  ret void
354}
355
; select (fcmp one %x, 0.0), 0.0, %z with %x and %z both kernel arguments;
; the constant 0.0 appears in both the compare and the select. The result is
; stored to %out[workitem.id.x].
; Expected lowering: both the compare and the select use the inline constant 0.
; SI and VI move %z into a VGPR first, GFX10 and GFX11 use the SGPR directly in
; the e64 form, and GFX12 uses s_cmp_nlg_f32 followed by s_cselect_b32.
; NOTE(review): the unnamed [8 x i32] argument sits between %out and %x, so
; %x/%z are loaded from a separate kernarg offset (0x13 / 0x4c in the checks);
; the reason for the padding is not stated here -- confirm before changing it.
356define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
357; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
358; SI:       ; %bb.0:
359; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
360; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x13
361; SI-NEXT:    s_mov_b32 s3, 0xf000
362; SI-NEXT:    s_mov_b32 s2, 0
363; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
364; SI-NEXT:    v_mov_b32_e32 v1, 0
365; SI-NEXT:    s_waitcnt lgkmcnt(0)
366; SI-NEXT:    v_mov_b32_e32 v2, s5
367; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
368; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
369; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
370; SI-NEXT:    s_endpgm
371;
372; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
373; VI:       ; %bb.0:
374; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
375; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x4c
376; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
377; VI-NEXT:    s_waitcnt lgkmcnt(0)
378; VI-NEXT:    v_mov_b32_e32 v1, s1
379; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
380; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
381; VI-NEXT:    v_mov_b32_e32 v2, s3
382; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
383; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
384; VI-NEXT:    flat_store_dword v[0:1], v2
385; VI-NEXT:    s_endpgm
386;
387; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
388; GFX10:       ; %bb.0:
389; GFX10-NEXT:    s_clause 0x1
390; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
391; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
392; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
393; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
395; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, s1, s[4:5]
396; GFX10-NEXT:    global_store_dword v0, v1, s[2:3]
397; GFX10-NEXT:    s_endpgm
398;
399; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
400; GFX11:       ; %bb.0:
401; GFX11-NEXT:    s_clause 0x1
402; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
403; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
404; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
405; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
406; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
407; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[4:5], s0, 0
409; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s1, s[4:5]
410; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
411; GFX11-NEXT:    s_endpgm
412;
413; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
414; GFX12:       ; %bb.0:
415; GFX12-NEXT:    s_clause 0x1
416; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
417; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
418; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
419; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
420; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
421; GFX12-NEXT:    s_wait_kmcnt 0x0
422; GFX12-NEXT:    s_cmp_nlg_f32 s0, 0
423; GFX12-NEXT:    s_cselect_b32 s0, s1, 0
424; GFX12-NEXT:    v_mov_b32_e32 v1, s0
425; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
426; GFX12-NEXT:    s_endpgm
  ; Per-workitem output slot: %out + workitem.id.x (sign-extended to i64).
427  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
428  %tid.ext = sext i32 %tid to i64
429  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; The same constant (0.0) is used by the compare and by the select.
430  %setcc = fcmp one float %x, 0.0
431  %select = select i1 %setcc, float 0.0, float %z
432  store float %select, ptr addrspace(1) %out.gep
433  ret void
434}
435
; select (fcmp one %x, 0.0), 0.0, %x -- one kernel argument and one constant
; are shared by the compare and the select; the result is stored to
; %out[workitem.id.x].
; Expected lowering: the same SGPR is read for the compare and for the selected
; value, and both instructions use the inline constant 0. SI and VI copy the
; SGPR into a VGPR for the e32 v_cndmask, GFX10 and GFX11 use it directly in
; the e64 form, and GFX12 uses s_cmp_nlg_f32 followed by s_cselect_b32.
436define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
437; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
438; SI:       ; %bb.0:
439; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
440; SI-NEXT:    s_load_dword s4, s[4:5], 0xb
441; SI-NEXT:    s_mov_b32 s3, 0xf000
442; SI-NEXT:    s_mov_b32 s2, 0
443; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
444; SI-NEXT:    v_mov_b32_e32 v1, 0
445; SI-NEXT:    s_waitcnt lgkmcnt(0)
446; SI-NEXT:    v_mov_b32_e32 v2, s4
447; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s4, 0
448; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
449; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
450; SI-NEXT:    s_endpgm
451;
452; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
453; VI:       ; %bb.0:
454; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
455; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
456; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
457; VI-NEXT:    s_waitcnt lgkmcnt(0)
458; VI-NEXT:    v_mov_b32_e32 v1, s1
459; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
460; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
461; VI-NEXT:    v_mov_b32_e32 v2, s2
462; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
463; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
464; VI-NEXT:    flat_store_dword v[0:1], v2
465; VI-NEXT:    s_endpgm
466;
467; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
468; GFX10:       ; %bb.0:
469; GFX10-NEXT:    s_clause 0x1
470; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x2c
471; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
472; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
473; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
474; GFX10-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s6, 0
475; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, s6, s[2:3]
476; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
477; GFX10-NEXT:    s_endpgm
478;
479; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
480; GFX11:       ; %bb.0:
481; GFX11-NEXT:    s_clause 0x1
482; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
483; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
484; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
485; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
486; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
487; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX11-NEXT:    v_cmp_nlg_f32_e64 s[2:3], s6, 0
489; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s6, s[2:3]
490; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
491; GFX11-NEXT:    s_endpgm
492;
493; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
494; GFX12:       ; %bb.0:
495; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
496; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
497; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
498; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
499; GFX12-NEXT:    s_wait_kmcnt 0x0
500; GFX12-NEXT:    s_cmp_nlg_f32 s2, 0
501; GFX12-NEXT:    s_cselect_b32 s2, s2, 0
502; GFX12-NEXT:    v_mov_b32_e32 v1, s2
503; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
504; GFX12-NEXT:    s_endpgm
  ; Per-workitem output slot: %out + workitem.id.x (sign-extended to i64).
505  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
506  %tid.ext = sext i32 %tid to i64
507  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; %x and the constant 0.0 are each used by both the compare and the select.
508  %setcc = fcmp one float %x, 0.0
509  %select = select i1 %setcc, float 0.0, float %x
510  store float %select, ptr addrspace(1) %out.gep
511  ret void
512}
513
; select (fcmp one %x, 0.0), 0.0, %z where %x is a kernel argument and %z is
; loaded from %z.ptr[workitem.id.x]; the result is stored to
; %out[workitem.id.x].
; Expected lowering: every target uses v_cndmask_b32_e32 with the loaded value
; in a VGPR, the inline constant 0 and the condition in vcc. Through GFX11 the
; compare is v_cmp_nlg_f32_e64 writing vcc; on GFX12 it is s_cmp_nlg_f32
; followed by s_cselect_b64 vcc, -1, 0.
514define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
515; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
516; SI:       ; %bb.0:
517; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
518; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
519; SI-NEXT:    s_mov_b32 s3, 0xf000
520; SI-NEXT:    s_mov_b32 s2, 0
521; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
522; SI-NEXT:    v_mov_b32_e32 v1, 0
523; SI-NEXT:    s_waitcnt lgkmcnt(0)
524; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
525; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
526; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s6, 0
527; SI-NEXT:    s_waitcnt vmcnt(0)
528; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
529; SI-NEXT:    s_waitcnt lgkmcnt(0)
530; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
531; SI-NEXT:    s_endpgm
532;
533; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
534; VI:       ; %bb.0:
535; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
536; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
537; VI-NEXT:    s_waitcnt lgkmcnt(0)
538; VI-NEXT:    v_mov_b32_e32 v1, s1
539; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
540; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
541; VI-NEXT:    flat_load_dword v3, v[0:1]
542; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
543; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
544; VI-NEXT:    s_waitcnt lgkmcnt(0)
545; VI-NEXT:    v_mov_b32_e32 v1, s1
546; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
547; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
548; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
549; VI-NEXT:    s_waitcnt vmcnt(0)
550; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
551; VI-NEXT:    flat_store_dword v[0:1], v2
552; VI-NEXT:    s_endpgm
553;
554; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
555; GFX10:       ; %bb.0:
556; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
557; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
558; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX10-NEXT:    global_load_dword v1, v0, s[0:1]
560; GFX10-NEXT:    s_clause 0x1
561; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
562; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
563; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
564; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX10-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
566; GFX10-NEXT:    s_waitcnt vmcnt(0)
567; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
568; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
569; GFX10-NEXT:    s_endpgm
570;
571; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
572; GFX11:       ; %bb.0:
573; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
574; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
575; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
576; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
577; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
579; GFX11-NEXT:    s_clause 0x1
580; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
581; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
582; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX11-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
584; GFX11-NEXT:    s_waitcnt vmcnt(0)
585; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
586; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
587; GFX11-NEXT:    s_endpgm
588;
589; GFX12-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
590; GFX12:       ; %bb.0:
591; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
592; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
593; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
594; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
595; GFX12-NEXT:    s_wait_kmcnt 0x0
596; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
597; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
598; GFX12-NEXT:    s_wait_kmcnt 0x0
599; GFX12-NEXT:    s_cmp_nlg_f32 s2, 0
600; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
601; GFX12-NEXT:    s_wait_loadcnt 0x0
602; GFX12-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
603; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
604; GFX12-NEXT:    s_endpgm
  ; Input and output slots are both indexed by workitem.id.x.
605  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
606  %tid.ext = sext i32 %tid to i64
607  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
608  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; %z comes from memory; %x is the scalar kernel argument being compared.
609  %z = load float, ptr addrspace(1) %z.gep
610  %setcc = fcmp one float %x, 0.0
611  %select = select i1 %setcc, float 0.0, float %z
612  store float %select, ptr addrspace(1) %out.gep
613  ret void
614}
615
; select (fcmp one %x, 0.0), 1.0, %z where %x is a kernel argument and %z is
; loaded from %z.ptr[workitem.id.x]; the result is stored to
; %out[workitem.id.x].
; Expected lowering: every target uses v_cndmask_b32_e32 with the loaded value
; in a VGPR, the inline constant 1.0 and the condition in vcc. Through GFX11
; the compare is v_cmp_nlg_f32_e64 writing vcc; on GFX12 it is s_cmp_nlg_f32
; followed by s_cselect_b64 vcc, -1, 0.
616define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
617; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
618; SI:       ; %bb.0:
619; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
620; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
621; SI-NEXT:    s_mov_b32 s3, 0xf000
622; SI-NEXT:    s_mov_b32 s2, 0
623; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
624; SI-NEXT:    v_mov_b32_e32 v1, 0
625; SI-NEXT:    s_waitcnt lgkmcnt(0)
626; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
627; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
628; SI-NEXT:    v_cmp_nlg_f32_e64 vcc, s6, 0
629; SI-NEXT:    s_waitcnt vmcnt(0)
630; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
631; SI-NEXT:    s_waitcnt lgkmcnt(0)
632; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
633; SI-NEXT:    s_endpgm
634;
635; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
636; VI:       ; %bb.0:
637; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
638; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
639; VI-NEXT:    s_waitcnt lgkmcnt(0)
640; VI-NEXT:    v_mov_b32_e32 v1, s1
641; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
642; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
643; VI-NEXT:    flat_load_dword v3, v[0:1]
644; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
645; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
646; VI-NEXT:    s_waitcnt lgkmcnt(0)
647; VI-NEXT:    v_mov_b32_e32 v1, s1
648; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
649; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
650; VI-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
651; VI-NEXT:    s_waitcnt vmcnt(0)
652; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
653; VI-NEXT:    flat_store_dword v[0:1], v2
654; VI-NEXT:    s_endpgm
655;
656; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
657; GFX10:       ; %bb.0:
658; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
659; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
660; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
661; GFX10-NEXT:    global_load_dword v1, v0, s[0:1]
662; GFX10-NEXT:    s_clause 0x1
663; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
664; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
665; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
666; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX10-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
668; GFX10-NEXT:    s_waitcnt vmcnt(0)
669; GFX10-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
670; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
671; GFX10-NEXT:    s_endpgm
672;
673; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
674; GFX11:       ; %bb.0:
675; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
676; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
677; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
678; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
679; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
680; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
681; GFX11-NEXT:    s_clause 0x1
682; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
683; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
684; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
685; GFX11-NEXT:    v_cmp_nlg_f32_e64 vcc, s2, 0
686; GFX11-NEXT:    s_waitcnt vmcnt(0)
687; GFX11-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
688; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
689; GFX11-NEXT:    s_endpgm
690;
691; GFX12-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
692; GFX12:       ; %bb.0:
693; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
694; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
695; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
696; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
697; GFX12-NEXT:    s_wait_kmcnt 0x0
698; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
699; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
700; GFX12-NEXT:    s_wait_kmcnt 0x0
701; GFX12-NEXT:    s_cmp_nlg_f32 s2, 0
702; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
703; GFX12-NEXT:    s_wait_loadcnt 0x0
704; GFX12-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
705; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
706; GFX12-NEXT:    s_endpgm
  ; Input and output slots are both indexed by workitem.id.x.
707  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
708  %tid.ext = sext i32 %tid to i64
709  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
710  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; %z comes from memory; %x is the scalar kernel argument being compared.
711  %z = load float, ptr addrspace(1) %z.gep
712  %setcc = fcmp one float %x, 0.0
713  %select = select i1 %setcc, float 1.0, float %z
714  store float %select, ptr addrspace(1) %out.gep
715  ret void
716}
717
; Select between the constant 1.0 and the scalar kernel argument %z, keyed on
; an ordered compare of a per-work-item loaded float %x against 0.0.
; The checks below expect the inverted compare v_cmp_ngt_f32 (with 0 as its
; first operand) followed by a single v_cndmask_b32 that carries 1.0 as an
; inline operand. The SI and VI checks first copy %z into a VGPR, while the
; GFX10, GFX11 and GFX12 checks use the SGPR directly via the _e64 encoding.
718define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
719; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
720; SI:       ; %bb.0:
721; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
722; SI-NEXT:    s_load_dword s8, s[4:5], 0xd
723; SI-NEXT:    s_mov_b32 s7, 0xf000
724; SI-NEXT:    s_mov_b32 s6, 0
725; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
726; SI-NEXT:    v_mov_b32_e32 v1, 0
727; SI-NEXT:    s_waitcnt lgkmcnt(0)
728; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
729; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
730; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
731; SI-NEXT:    v_mov_b32_e32 v3, s8
732; SI-NEXT:    s_waitcnt vmcnt(0)
733; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v2
734; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
735; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
736; SI-NEXT:    s_endpgm
737;
738; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
739; VI:       ; %bb.0:
740; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
741; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
742; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
743; VI-NEXT:    s_waitcnt lgkmcnt(0)
744; VI-NEXT:    v_mov_b32_e32 v1, s3
745; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
746; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
747; VI-NEXT:    flat_load_dword v3, v[0:1]
748; VI-NEXT:    v_mov_b32_e32 v1, s1
749; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
750; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
751; VI-NEXT:    v_mov_b32_e32 v4, s4
752; VI-NEXT:    s_waitcnt vmcnt(0)
753; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v3
754; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
755; VI-NEXT:    flat_store_dword v[0:1], v2
756; VI-NEXT:    s_endpgm
757;
758; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
759; GFX10:       ; %bb.0:
760; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
761; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
762; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
763; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
765; GFX10-NEXT:    s_waitcnt vmcnt(0)
766; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
767; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, s4, vcc
768; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
769; GFX10-NEXT:    s_endpgm
770;
771; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
772; GFX11:       ; %bb.0:
773; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
774; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
775; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
776; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
777; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
778; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
780; GFX11-NEXT:    s_waitcnt vmcnt(0)
781; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
782; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, s4, vcc
783; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
784; GFX11-NEXT:    s_endpgm
785;
786; GFX12-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
787; GFX12:       ; %bb.0:
788; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
789; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
790; GFX12-NEXT:    s_load_b32 s4, s[4:5], 0x34
791; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
792; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
793; GFX12-NEXT:    s_wait_kmcnt 0x0
794; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
795; GFX12-NEXT:    s_wait_loadcnt 0x0
796; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
797; GFX12-NEXT:    v_cndmask_b32_e64 v1, 1.0, s4, vcc
798; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
799; GFX12-NEXT:    s_endpgm
  ; Index both the input and output arrays by the work-item id.
800  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
801  %tid.ext = sext i32 %tid to i64
802  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
803  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
804  %x = load float, ptr addrspace(1) %x.gep
  ; olt is an ordered predicate: it is false when %x is NaN, so %z is stored then.
805  %setcc = fcmp olt float %x, 0.0
  ; The constant is the true operand here; %z (a kernel argument) is the false one.
806  %select = select i1 %setcc, float 1.0, float %z
807  store float %select, ptr addrspace(1) %out.gep
808  ret void
809}
810
; Select between the constant 1.0 and a per-work-item loaded float %z, keyed
; on an unordered compare (ult) of a loaded float %x against 0.0.
; Both inputs are volatile loads; the checks show each load followed by its
; own wait, tagged glc (plus dlc on GFX10/GFX11, scope:SCOPE_SYS on GFX12).
; The checks expect the inverted compare v_cmp_le_f32 (with 0 as its first
; operand) and one v_cndmask_b32_e32 carrying 1.0 as an inline operand.
811define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
812; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
813; SI:       ; %bb.0:
814; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
815; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
816; SI-NEXT:    s_mov_b32 s11, 0xf000
817; SI-NEXT:    s_mov_b32 s10, 0
818; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
819; SI-NEXT:    v_mov_b32_e32 v1, 0
820; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
821; SI-NEXT:    s_waitcnt lgkmcnt(0)
822; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
823; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
824; SI-NEXT:    s_waitcnt vmcnt(0)
825; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
826; SI-NEXT:    s_waitcnt vmcnt(0)
827; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
828; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v2
829; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
830; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
831; SI-NEXT:    s_endpgm
832;
833; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
834; VI:       ; %bb.0:
835; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
836; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
837; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
838; VI-NEXT:    s_waitcnt lgkmcnt(0)
839; VI-NEXT:    v_mov_b32_e32 v1, s3
840; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
841; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
842; VI-NEXT:    v_mov_b32_e32 v3, s5
843; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
844; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
845; VI-NEXT:    flat_load_dword v5, v[0:1] glc
846; VI-NEXT:    s_waitcnt vmcnt(0)
847; VI-NEXT:    flat_load_dword v2, v[2:3] glc
848; VI-NEXT:    s_waitcnt vmcnt(0)
849; VI-NEXT:    v_mov_b32_e32 v1, s1
850; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
851; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
852; VI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v5
853; VI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
854; VI-NEXT:    flat_store_dword v[0:1], v2
855; VI-NEXT:    s_endpgm
856;
857; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
858; GFX10:       ; %bb.0:
859; GFX10-NEXT:    s_clause 0x1
860; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
861; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
862; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
863; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
865; GFX10-NEXT:    s_waitcnt vmcnt(0)
866; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
867; GFX10-NEXT:    s_waitcnt vmcnt(0)
868; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
869; GFX10-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
870; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
871; GFX10-NEXT:    s_endpgm
872;
873; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
874; GFX11:       ; %bb.0:
875; GFX11-NEXT:    s_clause 0x1
876; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
877; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
878; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
879; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
880; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
881; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
883; GFX11-NEXT:    s_waitcnt vmcnt(0)
884; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
885; GFX11-NEXT:    s_waitcnt vmcnt(0)
886; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
887; GFX11-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
888; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
889; GFX11-NEXT:    s_endpgm
890;
891; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
892; GFX12:       ; %bb.0:
893; GFX12-NEXT:    s_clause 0x1
894; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
895; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
896; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
897; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
898; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
899; GFX12-NEXT:    s_wait_kmcnt 0x0
900; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
901; GFX12-NEXT:    s_wait_loadcnt 0x0
902; GFX12-NEXT:    global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
903; GFX12-NEXT:    s_wait_loadcnt 0x0
904; GFX12-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
905; GFX12-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
906; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
907; GFX12-NEXT:    s_endpgm
  ; Index the x, z and output arrays by the work-item id.
908  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
909  %tid.ext = sext i32 %tid to i64
910  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
911  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
912  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: both must be emitted, in this order.
913  %x = load volatile float, ptr addrspace(1) %x.gep
914  %z = load volatile float, ptr addrspace(1) %z.gep
  ; ult is an unordered predicate: it is also true when %x is NaN.
915  %setcc = fcmp ult float %x, 0.0
916  %select = select i1 %setcc, float 1.0, float %z
917  store float %select, ptr addrspace(1) %out.gep
918  ret void
919}
920
; Integer counterpart of the f32 test: select between the constant 2 and a
; per-work-item loaded i32 %z, keyed on a signed "less than 0" compare of a
; loaded i32 %x.
; The checks expect the inverse compare written as v_cmp_lt_i32 with -1 as its
; first operand, and one v_cndmask_b32_e32 carrying 2 as an inline operand.
921define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
922; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
923; SI:       ; %bb.0:
924; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
925; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
926; SI-NEXT:    s_mov_b32 s11, 0xf000
927; SI-NEXT:    s_mov_b32 s10, 0
928; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
929; SI-NEXT:    v_mov_b32_e32 v1, 0
930; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
931; SI-NEXT:    s_waitcnt lgkmcnt(0)
932; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
933; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
934; SI-NEXT:    s_waitcnt vmcnt(0)
935; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
936; SI-NEXT:    s_waitcnt vmcnt(0)
937; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
938; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
939; SI-NEXT:    v_cndmask_b32_e32 v2, 2, v3, vcc
940; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
941; SI-NEXT:    s_endpgm
942;
943; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
944; VI:       ; %bb.0:
945; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
946; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
947; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
948; VI-NEXT:    s_waitcnt lgkmcnt(0)
949; VI-NEXT:    v_mov_b32_e32 v1, s3
950; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
951; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
952; VI-NEXT:    v_mov_b32_e32 v3, s5
953; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
954; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
955; VI-NEXT:    flat_load_dword v5, v[0:1] glc
956; VI-NEXT:    s_waitcnt vmcnt(0)
957; VI-NEXT:    flat_load_dword v2, v[2:3] glc
958; VI-NEXT:    s_waitcnt vmcnt(0)
959; VI-NEXT:    v_mov_b32_e32 v1, s1
960; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
961; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
962; VI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v5
963; VI-NEXT:    v_cndmask_b32_e32 v2, 2, v2, vcc
964; VI-NEXT:    flat_store_dword v[0:1], v2
965; VI-NEXT:    s_endpgm
966;
967; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
968; GFX10:       ; %bb.0:
969; GFX10-NEXT:    s_clause 0x1
970; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
971; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
972; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
973; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
975; GFX10-NEXT:    s_waitcnt vmcnt(0)
976; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
977; GFX10-NEXT:    s_waitcnt vmcnt(0)
978; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
979; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2, v2, vcc
980; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
981; GFX10-NEXT:    s_endpgm
982;
983; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
984; GFX11:       ; %bb.0:
985; GFX11-NEXT:    s_clause 0x1
986; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
987; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
988; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
989; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
990; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
991; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
992; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
993; GFX11-NEXT:    s_waitcnt vmcnt(0)
994; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
995; GFX11-NEXT:    s_waitcnt vmcnt(0)
996; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
997; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2, v2, vcc
998; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
999; GFX11-NEXT:    s_endpgm
1000;
1001; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
1002; GFX12:       ; %bb.0:
1003; GFX12-NEXT:    s_clause 0x1
1004; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1005; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1006; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1007; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1008; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1009; GFX12-NEXT:    s_wait_kmcnt 0x0
1010; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
1011; GFX12-NEXT:    s_wait_loadcnt 0x0
1012; GFX12-NEXT:    global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
1013; GFX12-NEXT:    s_wait_loadcnt 0x0
1014; GFX12-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
1015; GFX12-NEXT:    v_cndmask_b32_e32 v1, 2, v2, vcc
1016; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1017; GFX12-NEXT:    s_endpgm
  ; Index the x, z and output arrays by the work-item id.
1018  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1019  %tid.ext = sext i32 %tid to i64
1020  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
1021  %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
1022  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: both must be emitted, in this order.
1023  %x = load volatile i32, ptr addrspace(1) %x.gep
1024  %z = load volatile i32, ptr addrspace(1) %z.gep
  ; Signed compare against zero; the constant 2 is the true operand of the select.
1025  %setcc = icmp slt i32 %x, 0
1026  %select = select i1 %setcc, i32 2, i32 %z
1027  store i32 %select, ptr addrspace(1) %out.gep
1028  ret void
1029}
1030
; 64-bit variant of the i32 test: select between the constant i64 2 and a
; per-work-item loaded i64 %z, keyed on a signed "less than 0" compare of a
; loaded i64 %x.
; The checks expect a single v_cmp_lt_i64 with -1 as its first operand, then
; two v_cndmask_b32_e32 sharing that condition, one per 32-bit register of the
; result pair, with inline constants 0 and 2.
1031define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1032; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
1033; SI:       ; %bb.0:
1034; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1035; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1036; SI-NEXT:    s_mov_b32 s11, 0xf000
1037; SI-NEXT:    s_mov_b32 s10, 0
1038; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1039; SI-NEXT:    v_mov_b32_e32 v1, 0
1040; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1041; SI-NEXT:    s_waitcnt lgkmcnt(0)
1042; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1043; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
1044; SI-NEXT:    s_waitcnt vmcnt(0)
1045; SI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc
1046; SI-NEXT:    s_waitcnt vmcnt(0)
1047; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1048; SI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[2:3]
1049; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
1050; SI-NEXT:    v_cndmask_b32_e32 v2, 2, v4, vcc
1051; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
1052; SI-NEXT:    s_endpgm
1053;
1054; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
1055; VI:       ; %bb.0:
1056; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1057; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1058; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1059; VI-NEXT:    s_waitcnt lgkmcnt(0)
1060; VI-NEXT:    v_mov_b32_e32 v1, s3
1061; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1062; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1063; VI-NEXT:    v_mov_b32_e32 v3, s5
1064; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1065; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1066; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
1067; VI-NEXT:    s_waitcnt vmcnt(0)
1068; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
1069; VI-NEXT:    s_waitcnt vmcnt(0)
1070; VI-NEXT:    v_mov_b32_e32 v5, s1
1071; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
1072; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1073; VI-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
1074; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
1075; VI-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
1076; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1077; VI-NEXT:    s_endpgm
1078;
1079; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
1080; GFX10:       ; %bb.0:
1081; GFX10-NEXT:    s_clause 0x1
1082; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1083; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1084; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1085; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
1087; GFX10-NEXT:    s_waitcnt vmcnt(0)
1088; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc
1089; GFX10-NEXT:    s_waitcnt vmcnt(0)
1090; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
1091; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
1092; GFX10-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
1093; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1094; GFX10-NEXT:    s_endpgm
1095;
1096; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
1097; GFX11:       ; %bb.0:
1098; GFX11-NEXT:    s_clause 0x1
1099; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1100; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1101; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1102; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1103; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1104; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1105; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3] glc dlc
1106; GFX11-NEXT:    s_waitcnt vmcnt(0)
1107; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5] glc dlc
1108; GFX11-NEXT:    s_waitcnt vmcnt(0)
1109; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
1110; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
1111; GFX11-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
1112; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
1113; GFX11-NEXT:    s_endpgm
1114;
1115; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
1116; GFX12:       ; %bb.0:
1117; GFX12-NEXT:    s_clause 0x1
1118; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1119; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1120; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1121; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1122; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1123; GFX12-NEXT:    s_wait_kmcnt 0x0
1124; GFX12-NEXT:    global_load_b64 v[0:1], v4, s[2:3] scope:SCOPE_SYS
1125; GFX12-NEXT:    s_wait_loadcnt 0x0
1126; GFX12-NEXT:    global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS
1127; GFX12-NEXT:    s_wait_loadcnt 0x0
1128; GFX12-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
1129; GFX12-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
1130; GFX12-NEXT:    v_cndmask_b32_e32 v0, 2, v2, vcc
1131; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
1132; GFX12-NEXT:    s_endpgm
  ; Index the x, z and output arrays (i64 elements) by the work-item id.
1133  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1134  %tid.ext = sext i32 %tid to i64
1135  %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
1136  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
1137  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: both must be emitted, in this order.
1138  %x = load volatile i64, ptr addrspace(1) %x.gep
1139  %z = load volatile i64, ptr addrspace(1) %z.gep
  ; Signed compare against zero; the constant 2 is the true operand of the select.
1140  %setcc = icmp slt i64 %x, 0
1141  %select = select i1 %setcc, i64 2, i64 %z
1142  store i64 %select, ptr addrspace(1) %out.gep
1143  ret void
1144}
1145
; Vector select driven by one scalar condition: a loaded float %x is compared
; with 4.0 (ugt), and the result picks either the loaded <4 x float> %z (true)
; or the constant vector <1.0, 2.0, -0.5, 4.0> (false).
; The checks expect one v_cmp_nge_f32 (with 4.0 as its first operand) whose
; result feeds four v_cndmask_b32_e32, one per element, each carrying its
; element of the constant vector as an inline operand.
1146define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1147; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1148; SI:       ; %bb.0:
1149; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1150; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1151; SI-NEXT:    s_mov_b32 s11, 0xf000
1152; SI-NEXT:    s_mov_b32 s10, 0
1153; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1154; SI-NEXT:    v_mov_b32_e32 v2, 0
1155; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1156; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1157; SI-NEXT:    v_mov_b32_e32 v5, v2
1158; SI-NEXT:    s_waitcnt lgkmcnt(0)
1159; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1160; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1161; SI-NEXT:    s_waitcnt vmcnt(0)
1162; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
1163; SI-NEXT:    s_waitcnt vmcnt(0)
1164; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1165; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
1166; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1167; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1168; SI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1169; SI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1170; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1171; SI-NEXT:    s_endpgm
1172;
1173; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1174; VI:       ; %bb.0:
1175; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1176; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1177; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1178; VI-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
1179; VI-NEXT:    s_waitcnt lgkmcnt(0)
1180; VI-NEXT:    v_mov_b32_e32 v2, s3
1181; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1182; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1183; VI-NEXT:    v_mov_b32_e32 v0, s5
1184; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v5
1185; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
1186; VI-NEXT:    flat_load_dword v6, v[1:2] glc
1187; VI-NEXT:    s_waitcnt vmcnt(0)
1188; VI-NEXT:    flat_load_dwordx4 v[0:3], v[3:4] glc
1189; VI-NEXT:    s_waitcnt vmcnt(0)
1190; VI-NEXT:    v_mov_b32_e32 v7, s1
1191; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v5
1192; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
1193; VI-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
1194; VI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1195; VI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1196; VI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1197; VI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1198; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1199; VI-NEXT:    s_endpgm
1200;
1201; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1202; GFX10:       ; %bb.0:
1203; GFX10-NEXT:    s_clause 0x1
1204; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1205; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1206; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1207; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
1208; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX10-NEXT:    global_load_dword v6, v4, s[2:3] glc dlc
1210; GFX10-NEXT:    s_waitcnt vmcnt(0)
1211; GFX10-NEXT:    global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
1212; GFX10-NEXT:    s_waitcnt vmcnt(0)
1213; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v6
1214; GFX10-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1215; GFX10-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1216; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1217; GFX10-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1218; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
1219; GFX10-NEXT:    s_endpgm
1220;
1221; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1222; GFX11:       ; %bb.0:
1223; GFX11-NEXT:    s_clause 0x1
1224; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1225; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1226; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1227; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1228; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1229; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1230; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX11-NEXT:    global_load_b32 v5, v1, s[2:3] glc dlc
1232; GFX11-NEXT:    s_waitcnt vmcnt(0)
1233; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[4:5] glc dlc
1234; GFX11-NEXT:    s_waitcnt vmcnt(0)
1235; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v5
1236; GFX11-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1237; GFX11-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1238; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1239; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1240; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1241; GFX11-NEXT:    s_endpgm
1242;
1243; GFX12-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1244; GFX12:       ; %bb.0:
1245; GFX12-NEXT:    s_clause 0x1
1246; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1247; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1248; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1249; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1250; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1251; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1252; GFX12-NEXT:    s_wait_kmcnt 0x0
1253; GFX12-NEXT:    global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS
1254; GFX12-NEXT:    s_wait_loadcnt 0x0
1255; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS
1256; GFX12-NEXT:    s_wait_loadcnt 0x0
1257; GFX12-NEXT:    v_cmp_nge_f32_e32 vcc, 4.0, v5
1258; GFX12-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1259; GFX12-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1260; GFX12-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1261; GFX12-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1262; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1263; GFX12-NEXT:    s_endpgm
  ; %x is indexed as a float array, %z and %out as <4 x float> arrays, all by
  ; the work-item id.
1264  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1265  %tid.ext = sext i32 %tid to i64
1266  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1267  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1268  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: both must be emitted, in this order.
1269  %x = load volatile float, ptr addrspace(1) %x.gep
1270  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  ; ugt is an unordered predicate: it is also true when %x is NaN.
1271  %setcc = fcmp ugt float %x, 4.0
  ; A single i1 condition selects whole vectors; the constant is the false operand.
1272  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
1273  store <4 x float> %select, ptr addrspace(1) %out.gep
1274  ret void
1275}
1276
; Same shape as the preceding v4f32 test with the select operands exchanged:
; a loaded float %x is compared with 4.0 (ugt), and the result picks either
; the constant vector <1.0, 2.0, -0.5, 4.0> (true) or the loaded
; <4 x float> %z (false).
; The checks expect one v_cmp_ge_f32 (with 4.0 as its first operand) whose
; result feeds four v_cndmask_b32_e32, one per element, each carrying its
; element of the constant vector as an inline operand.
1277define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1278; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1279; SI:       ; %bb.0:
1280; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1281; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1282; SI-NEXT:    s_mov_b32 s11, 0xf000
1283; SI-NEXT:    s_mov_b32 s10, 0
1284; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1285; SI-NEXT:    v_mov_b32_e32 v2, 0
1286; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1287; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1288; SI-NEXT:    v_mov_b32_e32 v5, v2
1289; SI-NEXT:    s_waitcnt lgkmcnt(0)
1290; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1291; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1292; SI-NEXT:    s_waitcnt vmcnt(0)
1293; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
1294; SI-NEXT:    s_waitcnt vmcnt(0)
1295; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1296; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
1297; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1298; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1299; SI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1300; SI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1301; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1302; SI-NEXT:    s_endpgm
1303;
1304; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1305; VI:       ; %bb.0:
1306; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1307; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1308; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1309; VI-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
1310; VI-NEXT:    s_waitcnt lgkmcnt(0)
1311; VI-NEXT:    v_mov_b32_e32 v2, s3
1312; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1313; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1314; VI-NEXT:    v_mov_b32_e32 v0, s5
1315; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v5
1316; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
1317; VI-NEXT:    flat_load_dword v6, v[1:2] glc
1318; VI-NEXT:    s_waitcnt vmcnt(0)
1319; VI-NEXT:    flat_load_dwordx4 v[0:3], v[3:4] glc
1320; VI-NEXT:    s_waitcnt vmcnt(0)
1321; VI-NEXT:    v_mov_b32_e32 v7, s1
1322; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v5
1323; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
1324; VI-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
1325; VI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1326; VI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1327; VI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1328; VI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1329; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1330; VI-NEXT:    s_endpgm
1331;
1332; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1333; GFX10:       ; %bb.0:
1334; GFX10-NEXT:    s_clause 0x1
1335; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1336; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1337; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1338; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
1339; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1340; GFX10-NEXT:    global_load_dword v6, v4, s[2:3] glc dlc
1341; GFX10-NEXT:    s_waitcnt vmcnt(0)
1342; GFX10-NEXT:    global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
1343; GFX10-NEXT:    s_waitcnt vmcnt(0)
1344; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v6
1345; GFX10-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1346; GFX10-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1347; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1348; GFX10-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1349; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
1350; GFX10-NEXT:    s_endpgm
1351;
1352; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1353; GFX11:       ; %bb.0:
1354; GFX11-NEXT:    s_clause 0x1
1355; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1356; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1357; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1359; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1360; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1361; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX11-NEXT:    global_load_b32 v5, v1, s[2:3] glc dlc
1363; GFX11-NEXT:    s_waitcnt vmcnt(0)
1364; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[4:5] glc dlc
1365; GFX11-NEXT:    s_waitcnt vmcnt(0)
1366; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v5
1367; GFX11-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1368; GFX11-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1369; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1370; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1371; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1372; GFX11-NEXT:    s_endpgm
1373;
1374; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1375; GFX12:       ; %bb.0:
1376; GFX12-NEXT:    s_clause 0x1
1377; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1378; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1379; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1380; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1381; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1382; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1383; GFX12-NEXT:    s_wait_kmcnt 0x0
1384; GFX12-NEXT:    global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS
1385; GFX12-NEXT:    s_wait_loadcnt 0x0
1386; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS
1387; GFX12-NEXT:    s_wait_loadcnt 0x0
1388; GFX12-NEXT:    v_cmp_ge_f32_e32 vcc, 4.0, v5
1389; GFX12-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1390; GFX12-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1391; GFX12-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1392; GFX12-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1393; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1394; GFX12-NEXT:    s_endpgm
  ; %x is indexed as a float array, %z and %out as <4 x float> arrays, all by
  ; the work-item id.
1395  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1396  %tid.ext = sext i32 %tid to i64
1397  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1398  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1399  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  ; Volatile loads: both must be emitted, in this order.
1400  %x = load volatile float, ptr addrspace(1) %x.gep
1401  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
  ; ugt is an unordered predicate: it is also true when %x is NaN.
1402  %setcc = fcmp ugt float %x, 4.0
  ; A single i1 condition selects whole vectors; the constant is the true operand.
1403  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
1404  store <4 x float> %select, ptr addrspace(1) %out.gep
1405  ret void
1406}
1407
1408; This must be swapped as a vector type before the condition has
1409; multiple uses.
; Kernel under test: per workitem, loads a float %x and a <4 x float> %z
; (both volatile), evaluates `fcmp ugt 4.0, %x` (constant on the left-hand
; side), and stores the select of the constant vector <1.0, 2.0, -0.5, 4.0>
; (condition true) or %z (condition false).
; The expected output for every checked target uses the inverted compare
; v_cmp_le_f32 with 4.0 as src0, followed by four v_cndmask_b32_e32
; instructions that each take one inline constant as src0 and one element of
; the loaded vector as src1.
1410define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1411; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1412; SI:       ; %bb.0:
1413; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1414; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1415; SI-NEXT:    s_mov_b32 s11, 0xf000
1416; SI-NEXT:    s_mov_b32 s10, 0
1417; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1418; SI-NEXT:    v_mov_b32_e32 v2, 0
1419; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1420; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1421; SI-NEXT:    v_mov_b32_e32 v5, v2
1422; SI-NEXT:    s_waitcnt lgkmcnt(0)
1423; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1424; SI-NEXT:    buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1425; SI-NEXT:    s_waitcnt vmcnt(0)
1426; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc
1427; SI-NEXT:    s_waitcnt vmcnt(0)
1428; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1429; SI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
1430; SI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1431; SI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1432; SI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1433; SI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1434; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1435; SI-NEXT:    s_endpgm
1436;
1437; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1438; VI:       ; %bb.0:
1439; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1440; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1441; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1442; VI-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
1443; VI-NEXT:    s_waitcnt lgkmcnt(0)
1444; VI-NEXT:    v_mov_b32_e32 v2, s3
1445; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1446; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1447; VI-NEXT:    v_mov_b32_e32 v0, s5
1448; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v5
1449; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
1450; VI-NEXT:    flat_load_dword v6, v[1:2] glc
1451; VI-NEXT:    s_waitcnt vmcnt(0)
1452; VI-NEXT:    flat_load_dwordx4 v[0:3], v[3:4] glc
1453; VI-NEXT:    s_waitcnt vmcnt(0)
1454; VI-NEXT:    v_mov_b32_e32 v7, s1
1455; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v5
1456; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
1457; VI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
1458; VI-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1459; VI-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1460; VI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1461; VI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1462; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1463; VI-NEXT:    s_endpgm
1464;
1465; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1466; GFX10:       ; %bb.0:
1467; GFX10-NEXT:    s_clause 0x1
1468; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1469; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1470; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1471; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 4, v0
1472; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1473; GFX10-NEXT:    global_load_dword v6, v4, s[2:3] glc dlc
1474; GFX10-NEXT:    s_waitcnt vmcnt(0)
1475; GFX10-NEXT:    global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc
1476; GFX10-NEXT:    s_waitcnt vmcnt(0)
1477; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v6
1478; GFX10-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1479; GFX10-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1480; GFX10-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1481; GFX10-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1482; GFX10-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
1483; GFX10-NEXT:    s_endpgm
1484;
1485; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1486; GFX11:       ; %bb.0:
1487; GFX11-NEXT:    s_clause 0x1
1488; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1489; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1490; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1491; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1492; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1493; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1494; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX11-NEXT:    global_load_b32 v5, v1, s[2:3] glc dlc
1496; GFX11-NEXT:    s_waitcnt vmcnt(0)
1497; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[4:5] glc dlc
1498; GFX11-NEXT:    s_waitcnt vmcnt(0)
1499; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v5
1500; GFX11-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1501; GFX11-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1502; GFX11-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1503; GFX11-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1504; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1505; GFX11-NEXT:    s_endpgm
1506;
1507; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1508; GFX12:       ; %bb.0:
1509; GFX12-NEXT:    s_clause 0x1
1510; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1511; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1512; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1513; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1514; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1515; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
1516; GFX12-NEXT:    s_wait_kmcnt 0x0
1517; GFX12-NEXT:    global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS
1518; GFX12-NEXT:    s_wait_loadcnt 0x0
1519; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS
1520; GFX12-NEXT:    s_wait_loadcnt 0x0
1521; GFX12-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v5
1522; GFX12-NEXT:    v_cndmask_b32_e32 v3, 4.0, v3, vcc
1523; GFX12-NEXT:    v_cndmask_b32_e32 v2, -0.5, v2, vcc
1524; GFX12-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
1525; GFX12-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
1526; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1527; GFX12-NEXT:    s_endpgm
  ; IR input. The sign-extended workitem id indexes %x.ptr, %z.ptr and %out.
1528  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1529  %tid.ext = sext i32 %tid to i64
1530  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1531  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1532  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1533  %x = load volatile float, ptr addrspace(1) %x.gep
1534  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
1535  %setcc = fcmp ugt float 4.0, %x
1536  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
1537  store <4 x float> %select, ptr addrspace(1) %out.gep
1538  ret void
1539}
1540
; Kernel under test: per workitem, loads an i32 %x and an i1 %z (both
; volatile) and stores `select (icmp slt %x, 0), true, %z` as an i1.
; In the expected output no v_cndmask chooses between the two inputs.
; Instead the compare result (v_cmp_gt_i32 0, x) is combined by s_or_b64 with
; a mask derived from bit 0 of the loaded byte (v_and_b32 1 followed by
; v_cmp_eq_u32 1), and a single v_cndmask_b32_e64 0, 1 converts the combined
; mask into the byte that is stored.
1541define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1542; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1543; SI:       ; %bb.0:
1544; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1545; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1546; SI-NEXT:    s_mov_b32 s6, 0
1547; SI-NEXT:    v_mov_b32_e32 v1, 0
1548; SI-NEXT:    s_mov_b32 s7, 0xf000
1549; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1550; SI-NEXT:    v_mov_b32_e32 v3, v1
1551; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1552; SI-NEXT:    s_waitcnt lgkmcnt(0)
1553; SI-NEXT:    s_mov_b64 s[4:5], s[10:11]
1554; SI-NEXT:    buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc
1555; SI-NEXT:    s_waitcnt vmcnt(0)
1556; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
1557; SI-NEXT:    s_waitcnt vmcnt(0)
1558; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1559; SI-NEXT:    v_and_b32_e32 v3, 1, v3
1560; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
1561; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v3
1562; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1563; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1564; SI-NEXT:    buffer_store_byte v2, v[0:1], s[8:11], 0 addr64
1565; SI-NEXT:    s_endpgm
1566;
1567; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1568; VI:       ; %bb.0:
1569; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1570; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1571; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1572; VI-NEXT:    s_waitcnt lgkmcnt(0)
1573; VI-NEXT:    v_mov_b32_e32 v2, s3
1574; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1575; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1576; VI-NEXT:    v_mov_b32_e32 v4, s5
1577; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
1578; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
1579; VI-NEXT:    flat_load_dword v2, v[1:2] glc
1580; VI-NEXT:    s_waitcnt vmcnt(0)
1581; VI-NEXT:    flat_load_ubyte v3, v[3:4] glc
1582; VI-NEXT:    s_waitcnt vmcnt(0)
1583; VI-NEXT:    v_mov_b32_e32 v1, s1
1584; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1585; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1586; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
1587; VI-NEXT:    v_and_b32_e32 v3, 1, v3
1588; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v3
1589; VI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1590; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1591; VI-NEXT:    flat_store_byte v[0:1], v2
1592; VI-NEXT:    s_endpgm
1593;
1594; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1595; GFX10:       ; %bb.0:
1596; GFX10-NEXT:    s_clause 0x1
1597; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1598; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1599; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1600; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX10-NEXT:    global_load_dword v2, v1, s[10:11] glc dlc
1602; GFX10-NEXT:    s_waitcnt vmcnt(0)
1603; GFX10-NEXT:    global_load_ubyte v3, v0, s[0:1] glc dlc
1604; GFX10-NEXT:    s_waitcnt vmcnt(0)
1605; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
1606; GFX10-NEXT:    v_and_b32_e32 v1, 1, v3
1607; GFX10-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v1
1608; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1609; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1610; GFX10-NEXT:    global_store_byte v0, v1, s[8:9]
1611; GFX10-NEXT:    s_endpgm
1612;
1613; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1614; GFX11:       ; %bb.0:
1615; GFX11-NEXT:    s_clause 0x1
1616; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
1617; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
1618; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1619; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1620; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1621; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1622; GFX11-NEXT:    global_load_b32 v1, v1, s[10:11] glc dlc
1623; GFX11-NEXT:    s_waitcnt vmcnt(0)
1624; GFX11-NEXT:    global_load_u8 v2, v0, s[0:1] glc dlc
1625; GFX11-NEXT:    s_waitcnt vmcnt(0)
1626; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
1627; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
1628; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1629; GFX11-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v2
1630; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1631; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1632; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1633; GFX11-NEXT:    global_store_b8 v0, v1, s[8:9]
1634; GFX11-NEXT:    s_endpgm
1635;
1636; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1637; GFX12:       ; %bb.0:
1638; GFX12-NEXT:    s_clause 0x1
1639; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
1640; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
1641; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1642; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1643; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1644; GFX12-NEXT:    s_wait_kmcnt 0x0
1645; GFX12-NEXT:    global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS
1646; GFX12-NEXT:    s_wait_loadcnt 0x0
1647; GFX12-NEXT:    global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS
1648; GFX12-NEXT:    s_wait_loadcnt 0x0
1649; GFX12-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
1650; GFX12-NEXT:    v_and_b32_e32 v2, 1, v2
1651; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1652; GFX12-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v2
1653; GFX12-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1654; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1655; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1656; GFX12-NEXT:    global_store_b8 v0, v1, s[8:9]
1657; GFX12-NEXT:    s_endpgm
  ; IR input. The sign-extended workitem id indexes %x.ptr, %z.ptr and %out.
1658  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1659  %tid.ext = sext i32 %tid to i64
1660  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
1661  %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
1662  %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
1663  %x = load volatile i32, ptr addrspace(1) %x.gep
1664  %z = load volatile i1, ptr addrspace(1) %z.gep
1665  %setcc = icmp slt i32 %x, 0
1666  %select = select i1 %setcc, i1 true, i1 %z
1667  store i1 %select, ptr addrspace(1) %out.gep
1668  ret void
1669}
1670
1671; Different types compared vs. selected
; Kernel under test: the compare is on a float %x (`fcmp ult %x, 0.0`) while
; the select chooses between double 1.0 (condition true) and a loaded double
; %z (condition false).
; The expected output uses the inverted compare v_cmp_le_f32 0, x and two
; v_cndmask_b32_e32, one per 32-bit half of the double: 0x3ff00000 for the
; high half and 0 for the low half. In the checks for the two oldest targets
; 0x3ff00000 is first moved into a VGPR with v_mov_b32_e32; in the checks for
; gfx1010 and newer it appears directly as a literal operand of the
; v_cndmask_b32_e32.
1672define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1673; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1674; SI:       ; %bb.0:
1675; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1676; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1677; SI-NEXT:    s_mov_b32 s11, 0xf000
1678; SI-NEXT:    s_mov_b32 s10, 0
1679; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1680; SI-NEXT:    v_mov_b32_e32 v2, 0
1681; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1682; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1683; SI-NEXT:    v_mov_b32_e32 v4, v2
1684; SI-NEXT:    s_waitcnt lgkmcnt(0)
1685; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1686; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
1687; SI-NEXT:    s_waitcnt vmcnt(0)
1688; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
1689; SI-NEXT:    s_waitcnt vmcnt(0)
1690; SI-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
1691; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1692; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v2
1693; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
1694; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1695; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
1696; SI-NEXT:    s_endpgm
1697;
1698; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1699; VI:       ; %bb.0:
1700; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1701; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1702; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1703; VI-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
1704; VI-NEXT:    s_waitcnt lgkmcnt(0)
1705; VI-NEXT:    v_mov_b32_e32 v2, s3
1706; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1707; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1708; VI-NEXT:    v_mov_b32_e32 v0, s5
1709; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v5
1710; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
1711; VI-NEXT:    flat_load_dword v6, v[1:2] glc
1712; VI-NEXT:    s_waitcnt vmcnt(0)
1713; VI-NEXT:    flat_load_dwordx2 v[0:1], v[3:4] glc
1714; VI-NEXT:    s_waitcnt vmcnt(0)
1715; VI-NEXT:    v_mov_b32_e32 v3, s1
1716; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v5
1717; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1718; VI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
1719; VI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v6
1720; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
1721; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1722; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1723; VI-NEXT:    s_endpgm
1724;
1725; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1726; GFX10:       ; %bb.0:
1727; GFX10-NEXT:    s_clause 0x1
1728; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1729; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1730; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1731; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1732; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1733; GFX10-NEXT:    global_load_dword v4, v2, s[2:3] glc dlc
1734; GFX10-NEXT:    s_waitcnt vmcnt(0)
1735; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
1736; GFX10-NEXT:    s_waitcnt vmcnt(0)
1737; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 0, v4
1738; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1739; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1740; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
1741; GFX10-NEXT:    s_endpgm
1742;
1743; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1744; GFX11:       ; %bb.0:
1745; GFX11-NEXT:    s_clause 0x1
1746; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1747; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1748; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1749; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1750; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1751; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1752; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX11-NEXT:    global_load_b32 v3, v1, s[2:3] glc dlc
1754; GFX11-NEXT:    s_waitcnt vmcnt(0)
1755; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[4:5] glc dlc
1756; GFX11-NEXT:    s_waitcnt vmcnt(0)
1757; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 0, v3
1758; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1759; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1760; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1761; GFX11-NEXT:    s_endpgm
1762;
1763; GFX12-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1764; GFX12:       ; %bb.0:
1765; GFX12-NEXT:    s_clause 0x1
1766; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1767; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1768; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1769; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1770; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1771; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1772; GFX12-NEXT:    s_wait_kmcnt 0x0
1773; GFX12-NEXT:    global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS
1774; GFX12-NEXT:    s_wait_loadcnt 0x0
1775; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS
1776; GFX12-NEXT:    s_wait_loadcnt 0x0
1777; GFX12-NEXT:    v_cmp_le_f32_e32 vcc, 0, v3
1778; GFX12-NEXT:    v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1779; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1780; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1781; GFX12-NEXT:    s_endpgm
  ; IR input. The sign-extended workitem id indexes %x.ptr, %z.ptr and %out.
1782  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1783  %tid.ext = sext i32 %tid to i64
1784  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1785  %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
1786  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
1787  %x = load volatile float, ptr addrspace(1) %x.gep
1788  %z = load volatile double, ptr addrspace(1) %z.gep
1789  %setcc = fcmp ult float %x, 0.0
1790  %select = select i1 %setcc, double 1.0, double %z
1791  store double %select, ptr addrspace(1) %out.gep
1792  ret void
1793}
1794
1795; Different types compared vs. selected
; Kernel under test: the compare is on a float %x (`fcmp one %x, 0.0`) while
; the select chooses between i64 3 (condition true) and a loaded i64 %z
; (condition false).
; The expected output for every checked target uses the inverted compare
; v_cmp_nlg_f32 0, x and two v_cndmask_b32_e32, one per 32-bit half of the
; i64: inline constant 0 for the high half and 3 for the low half.
1796define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1797; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1798; SI:       ; %bb.0:
1799; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1800; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1801; SI-NEXT:    s_mov_b32 s11, 0xf000
1802; SI-NEXT:    s_mov_b32 s10, 0
1803; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1804; SI-NEXT:    v_mov_b32_e32 v2, 0
1805; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1806; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1807; SI-NEXT:    v_mov_b32_e32 v4, v2
1808; SI-NEXT:    s_waitcnt lgkmcnt(0)
1809; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1810; SI-NEXT:    buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
1811; SI-NEXT:    s_waitcnt vmcnt(0)
1812; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
1813; SI-NEXT:    s_waitcnt vmcnt(0)
1814; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1815; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v2
1816; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1817; SI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
1818; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
1819; SI-NEXT:    s_endpgm
1820;
1821; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1822; VI:       ; %bb.0:
1823; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1824; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1825; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1826; VI-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
1827; VI-NEXT:    s_waitcnt lgkmcnt(0)
1828; VI-NEXT:    v_mov_b32_e32 v2, s3
1829; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1830; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1831; VI-NEXT:    v_mov_b32_e32 v0, s5
1832; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v5
1833; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v0, vcc
1834; VI-NEXT:    flat_load_dword v6, v[1:2] glc
1835; VI-NEXT:    s_waitcnt vmcnt(0)
1836; VI-NEXT:    flat_load_dwordx2 v[0:1], v[3:4] glc
1837; VI-NEXT:    s_waitcnt vmcnt(0)
1838; VI-NEXT:    v_mov_b32_e32 v3, s1
1839; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v5
1840; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1841; VI-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v6
1842; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1843; VI-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
1844; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1845; VI-NEXT:    s_endpgm
1846;
1847; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1848; GFX10:       ; %bb.0:
1849; GFX10-NEXT:    s_clause 0x1
1850; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1851; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1852; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1853; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1854; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX10-NEXT:    global_load_dword v4, v2, s[2:3] glc dlc
1856; GFX10-NEXT:    s_waitcnt vmcnt(0)
1857; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
1858; GFX10-NEXT:    s_waitcnt vmcnt(0)
1859; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v4
1860; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1861; GFX10-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
1862; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
1863; GFX10-NEXT:    s_endpgm
1864;
1865; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1866; GFX11:       ; %bb.0:
1867; GFX11-NEXT:    s_clause 0x1
1868; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1869; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1870; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1871; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1872; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1873; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1874; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1875; GFX11-NEXT:    global_load_b32 v3, v1, s[2:3] glc dlc
1876; GFX11-NEXT:    s_waitcnt vmcnt(0)
1877; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[4:5] glc dlc
1878; GFX11-NEXT:    s_waitcnt vmcnt(0)
1879; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v3
1880; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1881; GFX11-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
1882; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1883; GFX11-NEXT:    s_endpgm
1884;
1885; GFX12-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1886; GFX12:       ; %bb.0:
1887; GFX12-NEXT:    s_clause 0x1
1888; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1889; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1890; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1891; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1892; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1893; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1894; GFX12-NEXT:    s_wait_kmcnt 0x0
1895; GFX12-NEXT:    global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS
1896; GFX12-NEXT:    s_wait_loadcnt 0x0
1897; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS
1898; GFX12-NEXT:    s_wait_loadcnt 0x0
1899; GFX12-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v3
1900; GFX12-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1901; GFX12-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
1902; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1903; GFX12-NEXT:    s_endpgm
  ; IR input. The sign-extended workitem id indexes %x.ptr, %z.ptr and %out.
1904  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1905  %tid.ext = sext i32 %tid to i64
1906  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1907  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
1908  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
1909  %x = load volatile float, ptr addrspace(1) %x.gep
1910  %z = load volatile i64, ptr addrspace(1) %z.gep
1911  %setcc = fcmp one float %x, 0.0
1912  %select = select i1 %setcc, i64 3, i64 %z
1913  store i64 %select, ptr addrspace(1) %out.gep
1914  ret void
1915}
1916
1917; Different types compared vs. selected
; Kernel under test: the compare is on an i32 %x (`icmp ugt %x, 1`) while the
; select chooses between float 4.0 (condition true) and a loaded float %z
; (condition false).
; The expected output for every checked target uses the inverted compare
; v_cmp_gt_u32 2, x (true when x is 0 or 1) and a single v_cndmask_b32_e32
; with the inline constant 4.0 as src0 and the loaded %z as src1.
1918define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1919; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1920; SI:       ; %bb.0:
1921; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1922; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1923; SI-NEXT:    s_mov_b32 s11, 0xf000
1924; SI-NEXT:    s_mov_b32 s10, 0
1925; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1926; SI-NEXT:    v_mov_b32_e32 v1, 0
1927; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
1928; SI-NEXT:    s_waitcnt lgkmcnt(0)
1929; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1930; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
1931; SI-NEXT:    s_waitcnt vmcnt(0)
1932; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1933; SI-NEXT:    s_waitcnt vmcnt(0)
1934; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
1935; SI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v2
1936; SI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v3, vcc
1937; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1938; SI-NEXT:    s_endpgm
1939;
1940; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1941; VI:       ; %bb.0:
1942; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1943; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1944; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1945; VI-NEXT:    s_waitcnt lgkmcnt(0)
1946; VI-NEXT:    v_mov_b32_e32 v1, s3
1947; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1948; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1949; VI-NEXT:    v_mov_b32_e32 v3, s5
1950; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1951; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1952; VI-NEXT:    flat_load_dword v5, v[0:1] glc
1953; VI-NEXT:    s_waitcnt vmcnt(0)
1954; VI-NEXT:    flat_load_dword v2, v[2:3] glc
1955; VI-NEXT:    s_waitcnt vmcnt(0)
1956; VI-NEXT:    v_mov_b32_e32 v1, s1
1957; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1958; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1959; VI-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v5
1960; VI-NEXT:    v_cndmask_b32_e32 v2, 4.0, v2, vcc
1961; VI-NEXT:    flat_store_dword v[0:1], v2
1962; VI-NEXT:    s_endpgm
1963;
1964; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1965; GFX10:       ; %bb.0:
1966; GFX10-NEXT:    s_clause 0x1
1967; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1968; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1969; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1970; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
1972; GFX10-NEXT:    s_waitcnt vmcnt(0)
1973; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
1974; GFX10-NEXT:    s_waitcnt vmcnt(0)
1975; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
1976; GFX10-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
1977; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1978; GFX10-NEXT:    s_endpgm
1979;
1980; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1981; GFX11:       ; %bb.0:
1982; GFX11-NEXT:    s_clause 0x1
1983; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1984; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1985; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1986; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1987; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1988; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1989; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
1990; GFX11-NEXT:    s_waitcnt vmcnt(0)
1991; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
1992; GFX11-NEXT:    s_waitcnt vmcnt(0)
1993; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
1994; GFX11-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
1995; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1996; GFX11-NEXT:    s_endpgm
1997;
1998; GFX12-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1999; GFX12:       ; %bb.0:
2000; GFX12-NEXT:    s_clause 0x1
2001; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2002; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2003; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2004; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2005; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2006; GFX12-NEXT:    s_wait_kmcnt 0x0
2007; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
2008; GFX12-NEXT:    s_wait_loadcnt 0x0
2009; GFX12-NEXT:    global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
2010; GFX12-NEXT:    s_wait_loadcnt 0x0
2011; GFX12-NEXT:    v_cmp_gt_u32_e32 vcc, 2, v1
2012; GFX12-NEXT:    v_cndmask_b32_e32 v1, 4.0, v2, vcc
2013; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
2014; GFX12-NEXT:    s_endpgm
  ; IR input. The sign-extended workitem id indexes %x.ptr, %z.ptr and %out.
2015  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2016  %tid.ext = sext i32 %tid to i64
2017  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
2018  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
2019  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
2020  %x = load volatile i32, ptr addrspace(1) %x.gep
2021  %z = load volatile float, ptr addrspace(1) %z.gep
2022  %setcc = icmp ugt i32 %x, 1
2023  %select = select i1 %setcc, float 4.0, float %z
2024  store float %select, ptr addrspace(1) %out.gep
2025  ret void
2026}
2027
2028; FIXME: Should be able to handle multiple uses
2029define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
2030; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
2031; SI:       ; %bb.0:
2032; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2033; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2034; SI-NEXT:    s_mov_b32 s11, 0xf000
2035; SI-NEXT:    s_mov_b32 s10, 0
2036; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2037; SI-NEXT:    v_mov_b32_e32 v1, 0
2038; SI-NEXT:    s_mov_b64 s[6:7], s[10:11]
2039; SI-NEXT:    s_waitcnt lgkmcnt(0)
2040; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2041; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
2042; SI-NEXT:    s_waitcnt vmcnt(0)
2043; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
2044; SI-NEXT:    s_waitcnt vmcnt(0)
2045; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
2046; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v2
2047; SI-NEXT:    v_cndmask_b32_e64 v2, v3, -1.0, vcc
2048; SI-NEXT:    v_cndmask_b32_e64 v3, v3, -2.0, vcc
2049; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2050; SI-NEXT:    s_waitcnt vmcnt(0)
2051; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
2052; SI-NEXT:    s_waitcnt vmcnt(0)
2053; SI-NEXT:    s_endpgm
2054;
2055; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
2056; VI:       ; %bb.0:
2057; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2058; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2059; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
2060; VI-NEXT:    s_waitcnt lgkmcnt(0)
2061; VI-NEXT:    v_mov_b32_e32 v1, s3
2062; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
2063; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2064; VI-NEXT:    v_mov_b32_e32 v3, s5
2065; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
2066; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2067; VI-NEXT:    flat_load_dword v5, v[0:1] glc
2068; VI-NEXT:    s_waitcnt vmcnt(0)
2069; VI-NEXT:    flat_load_dword v2, v[2:3] glc
2070; VI-NEXT:    s_waitcnt vmcnt(0)
2071; VI-NEXT:    v_mov_b32_e32 v1, s1
2072; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
2073; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2074; VI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v5
2075; VI-NEXT:    v_cndmask_b32_e64 v3, v2, -1.0, vcc
2076; VI-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
2077; VI-NEXT:    flat_store_dword v[0:1], v3
2078; VI-NEXT:    s_waitcnt vmcnt(0)
2079; VI-NEXT:    flat_store_dword v[0:1], v2
2080; VI-NEXT:    s_waitcnt vmcnt(0)
2081; VI-NEXT:    s_endpgm
2082;
2083; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
2084; GFX10:       ; %bb.0:
2085; GFX10-NEXT:    s_clause 0x1
2086; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2087; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2088; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2089; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2090; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
2091; GFX10-NEXT:    s_waitcnt vmcnt(0)
2092; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
2093; GFX10-NEXT:    s_waitcnt vmcnt(0)
2094; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
2095; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
2096; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
2097; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2098; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2099; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
2100; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2101; GFX10-NEXT:    s_endpgm
2102;
2103; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
2104; GFX11:       ; %bb.0:
2105; GFX11-NEXT:    s_clause 0x1
2106; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2107; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2108; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2109; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2110; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2111; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2112; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
2113; GFX11-NEXT:    s_waitcnt vmcnt(0)
2114; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
2115; GFX11-NEXT:    s_waitcnt vmcnt(0)
2116; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
2117; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
2118; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
2119; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
2120; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2121; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
2122; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2123; GFX11-NEXT:    s_endpgm
2124;
2125; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
2126; GFX12:       ; %bb.0:
2127; GFX12-NEXT:    s_clause 0x1
2128; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2129; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2130; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2131; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2132; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2133; GFX12-NEXT:    s_wait_kmcnt 0x0
2134; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
2135; GFX12-NEXT:    s_wait_loadcnt 0x0
2136; GFX12-NEXT:    global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
2137; GFX12-NEXT:    s_wait_loadcnt 0x0
2138; GFX12-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
2139; GFX12-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
2140; GFX12-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
2141; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
2142; GFX12-NEXT:    s_wait_storecnt 0x0
2143; GFX12-NEXT:    global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS
2144; GFX12-NEXT:    s_wait_storecnt 0x0
2145; GFX12-NEXT:    s_endpgm
2146  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2147  %tid.ext = sext i32 %tid to i64
2148  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
2149  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
2150  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
2151  %x = load volatile float, ptr addrspace(1) %x.gep
2152  %z = load volatile float, ptr addrspace(1) %z.gep
2153  %setcc = fcmp ugt float 4.0, %x
2154  %select0 = select i1 %setcc, float -1.0, float %z
2155  %select1 = select i1 %setcc, float -2.0, float %z
2156  store volatile float %select0, ptr addrspace(1) %out.gep
2157  store volatile float %select1, ptr addrspace(1) %out.gep
2158  ret void
2159}
2160
2161; Source modifiers abs/neg only work for f32
; Half-precision case of select(c != 0, fabs(f), fneg(f)), with f loaded from
; fptr[workitem.id.x] and the result stored to out.
; In the checks below v_cndmask never carries abs/neg modifiers for half. The SI
; checks place |v0| and -v0 on v_cvt_f32_f16 and select on the converted values,
; while the VI and GFX10+ checks clear/flip the sign bit with
; v_and_b32 0x7fff and v_xor_b32 0x8000 before a plain v_cndmask_b32_e32.
2162define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
2163; SI-LABEL: v_cndmask_abs_neg_f16:
2164; SI:       ; %bb.0:
2165; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
2166; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2167; SI-NEXT:    s_mov_b32 s7, 0xf000
2168; SI-NEXT:    s_mov_b32 s2, 0
2169; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2170; SI-NEXT:    v_mov_b32_e32 v1, 0
2171; SI-NEXT:    s_mov_b32 s3, s7
2172; SI-NEXT:    s_waitcnt lgkmcnt(0)
2173; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
2174; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2175; SI-NEXT:    s_mov_b32 s6, -1
2176; SI-NEXT:    s_cmp_lg_u32 s8, 0
2177; SI-NEXT:    s_waitcnt vmcnt(0)
2178; SI-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
2179; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
2180; SI-NEXT:    s_cselect_b64 vcc, -1, 0
2181; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2182; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2183; SI-NEXT:    s_waitcnt lgkmcnt(0)
2184; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
2185; SI-NEXT:    s_endpgm
2186;
2187; VI-LABEL: v_cndmask_abs_neg_f16:
2188; VI:       ; %bb.0:
2189; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2190; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2191; VI-NEXT:    s_waitcnt lgkmcnt(0)
2192; VI-NEXT:    v_mov_b32_e32 v1, s1
2193; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2194; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2195; VI-NEXT:    flat_load_ushort v0, v[0:1]
2196; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
2197; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2198; VI-NEXT:    s_waitcnt lgkmcnt(0)
2199; VI-NEXT:    s_cmp_lg_u32 s2, 0
2200; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2201; VI-NEXT:    s_waitcnt vmcnt(0)
2202; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
2203; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
2204; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
2205; VI-NEXT:    v_mov_b32_e32 v0, s0
2206; VI-NEXT:    v_mov_b32_e32 v1, s1
2207; VI-NEXT:    flat_store_short v[0:1], v2
2208; VI-NEXT:    s_endpgm
2209;
2210; GFX10-LABEL: v_cndmask_abs_neg_f16:
2211; GFX10:       ; %bb.0:
2212; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2213; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2214; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2215; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2216; GFX10-NEXT:    global_load_ushort v0, v0, s[0:1]
2217; GFX10-NEXT:    s_clause 0x1
2218; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
2219; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2220; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2221; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2222; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
2223; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
2224; GFX10-NEXT:    s_waitcnt vmcnt(0)
2225; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
2226; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
2227; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2228; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
2229; GFX10-NEXT:    s_endpgm
2230;
2231; GFX11-LABEL: v_cndmask_abs_neg_f16:
2232; GFX11:       ; %bb.0:
2233; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2234; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2235; GFX11-NEXT:    v_mov_b32_e32 v2, 0
2236; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2237; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2238; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2239; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
2240; GFX11-NEXT:    s_clause 0x1
2241; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
2242; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2243; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2244; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
2245; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
2246; GFX11-NEXT:    s_waitcnt vmcnt(0)
2247; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
2248; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
2249; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2250; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2251; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
2252; GFX11-NEXT:    s_endpgm
2253;
2254; GFX12-LABEL: v_cndmask_abs_neg_f16:
2255; GFX12:       ; %bb.0:
2256; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2257; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2258; GFX12-NEXT:    v_mov_b32_e32 v2, 0
2259; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2260; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2261; GFX12-NEXT:    s_wait_kmcnt 0x0
2262; GFX12-NEXT:    global_load_u16 v0, v0, s[0:1]
2263; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
2264; GFX12-NEXT:    s_wait_kmcnt 0x0
2265; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
2266; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
2267; GFX12-NEXT:    s_wait_loadcnt 0x0
2268; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
2269; GFX12-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
2270; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2271; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2272; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
2273; GFX12-NEXT:    s_endpgm
; Kernel body. Both select operands derive from the same loaded half %f, and
; the i32 argument %c chooses fabs (c != 0) or fneg (c == 0).
2274  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
2275  %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
2276  %f = load half, ptr addrspace(1) %f.gep
2277  %f.abs = call half @llvm.fabs.f16(half %f)
2278  %f.neg = fneg half %f
2279  %setcc = icmp ne i32 %c, 0
2280  %select = select i1 %setcc, half %f.abs, half %f.neg
2281  store half %select, ptr addrspace(1) %out
2282  ret void
2283}
2284
; Single-precision case of select(c != 0, fabs(f), fneg(f)), with f loaded from
; fptr[workitem.id.x] and the result stored to out.
; Every check block below expects both modifiers folded into one
; v_cndmask_b32_e64 (operands -v0 and |v0|), with the condition held in an
; SGPR pair produced by s_cselect_b64 rather than in vcc.
2285define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
2286; SI-LABEL: v_cndmask_abs_neg_f32:
2287; SI:       ; %bb.0:
2288; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2289; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
2290; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2291; SI-NEXT:    s_mov_b32 s3, 0xf000
2292; SI-NEXT:    s_mov_b32 s6, 0
2293; SI-NEXT:    s_mov_b32 s7, s3
2294; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2295; SI-NEXT:    v_mov_b32_e32 v1, 0
2296; SI-NEXT:    s_waitcnt lgkmcnt(0)
2297; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2298; SI-NEXT:    s_mov_b32 s2, -1
2299; SI-NEXT:    s_cmp_lg_u32 s8, 0
2300; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
2301; SI-NEXT:    s_waitcnt vmcnt(0)
2302; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5]
2303; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2304; SI-NEXT:    s_endpgm
2305;
2306; VI-LABEL: v_cndmask_abs_neg_f32:
2307; VI:       ; %bb.0:
2308; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2309; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2310; VI-NEXT:    s_waitcnt lgkmcnt(0)
2311; VI-NEXT:    v_mov_b32_e32 v1, s1
2312; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2313; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2314; VI-NEXT:    flat_load_dword v0, v[0:1]
2315; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
2316; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2317; VI-NEXT:    s_waitcnt lgkmcnt(0)
2318; VI-NEXT:    s_cmp_lg_u32 s2, 0
2319; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
2320; VI-NEXT:    s_waitcnt vmcnt(0)
2321; VI-NEXT:    v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
2322; VI-NEXT:    v_mov_b32_e32 v0, s0
2323; VI-NEXT:    v_mov_b32_e32 v1, s1
2324; VI-NEXT:    flat_store_dword v[0:1], v2
2325; VI-NEXT:    s_endpgm
2326;
2327; GFX10-LABEL: v_cndmask_abs_neg_f32:
2328; GFX10:       ; %bb.0:
2329; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2330; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2331; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2332; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2333; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
2334; GFX10-NEXT:    s_clause 0x1
2335; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
2336; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2337; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2338; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2339; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
2340; GFX10-NEXT:    s_cselect_b64 s[2:3], -1, 0
2341; GFX10-NEXT:    s_waitcnt vmcnt(0)
2342; GFX10-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
2343; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2344; GFX10-NEXT:    s_endpgm
2345;
2346; GFX11-LABEL: v_cndmask_abs_neg_f32:
2347; GFX11:       ; %bb.0:
2348; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2349; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2350; GFX11-NEXT:    v_mov_b32_e32 v1, 0
2351; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2352; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2353; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2354; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
2355; GFX11-NEXT:    s_clause 0x1
2356; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
2357; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2358; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2359; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
2360; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
2361; GFX11-NEXT:    s_waitcnt vmcnt(0)
2362; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
2363; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2364; GFX11-NEXT:    s_endpgm
2365;
2366; GFX12-LABEL: v_cndmask_abs_neg_f32:
2367; GFX12:       ; %bb.0:
2368; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2369; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2370; GFX12-NEXT:    v_mov_b32_e32 v1, 0
2371; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2372; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2373; GFX12-NEXT:    s_wait_kmcnt 0x0
2374; GFX12-NEXT:    global_load_b32 v0, v0, s[0:1]
2375; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
2376; GFX12-NEXT:    s_wait_kmcnt 0x0
2377; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
2378; GFX12-NEXT:    s_cselect_b64 s[2:3], -1, 0
2379; GFX12-NEXT:    s_wait_loadcnt 0x0
2380; GFX12-NEXT:    v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
2381; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
2382; GFX12-NEXT:    s_endpgm
; Kernel body. Both select operands derive from the same loaded float %f, and
; the i32 argument %c chooses fabs (c != 0) or fneg (c == 0).
2383  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
2384  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
2385  %f = load float, ptr addrspace(1) %f.gep
2386  %f.abs = call float @llvm.fabs.f32(float %f)
2387  %f.neg = fneg float %f
2388  %setcc = icmp ne i32 %c, 0
2389  %select = select i1 %setcc, float %f.abs, float %f.neg
2390  store float %select, ptr addrspace(1) %out
2391  ret void
2392}
2393
; Double-precision case of select(c != 0, fabs(f), fneg(f)), with f loaded from
; fptr[workitem.id.x] and the result stored to out.
; In the checks below v_cndmask carries no abs/neg modifiers. The high dword (v1)
; is masked with 0x7fffffff and xor-ed with 0x80000000, then selected with
; v_cndmask_b32_e32. The low dword is also routed through a v_cndmask whose two
; sources are the same register (v0, v0), so that select cannot change the value.
2394define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
2395; SI-LABEL: v_cndmask_abs_neg_f64:
2396; SI:       ; %bb.0:
2397; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
2398; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2399; SI-NEXT:    s_mov_b32 s7, 0xf000
2400; SI-NEXT:    s_mov_b32 s2, 0
2401; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2402; SI-NEXT:    v_mov_b32_e32 v1, 0
2403; SI-NEXT:    s_mov_b32 s3, s7
2404; SI-NEXT:    s_waitcnt lgkmcnt(0)
2405; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
2406; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2407; SI-NEXT:    s_mov_b32 s6, -1
2408; SI-NEXT:    s_cmp_lg_u32 s8, 0
2409; SI-NEXT:    s_waitcnt vmcnt(0)
2410; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
2411; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
2412; SI-NEXT:    s_cselect_b64 vcc, -1, 0
2413; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2414; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
2415; SI-NEXT:    s_waitcnt lgkmcnt(0)
2416; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2417; SI-NEXT:    s_endpgm
2418;
2419; VI-LABEL: v_cndmask_abs_neg_f64:
2420; VI:       ; %bb.0:
2421; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2422; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2423; VI-NEXT:    s_waitcnt lgkmcnt(0)
2424; VI-NEXT:    v_mov_b32_e32 v1, s1
2425; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2426; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2427; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2428; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
2429; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2430; VI-NEXT:    s_waitcnt lgkmcnt(0)
2431; VI-NEXT:    s_cmp_lg_u32 s2, 0
2432; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2433; VI-NEXT:    s_waitcnt vmcnt(0)
2434; VI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
2435; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
2436; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2437; VI-NEXT:    v_mov_b32_e32 v3, s1
2438; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
2439; VI-NEXT:    v_mov_b32_e32 v2, s0
2440; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2441; VI-NEXT:    s_endpgm
2442;
2443; GFX10-LABEL: v_cndmask_abs_neg_f64:
2444; GFX10:       ; %bb.0:
2445; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2446; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2447; GFX10-NEXT:    v_mov_b32_e32 v3, 0
2448; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2449; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
2450; GFX10-NEXT:    s_clause 0x1
2451; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
2452; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2453; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2454; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2455; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
2456; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
2457; GFX10-NEXT:    s_waitcnt vmcnt(0)
2458; GFX10-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
2459; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
2460; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
2461; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2462; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
2463; GFX10-NEXT:    s_endpgm
2464;
2465; GFX11-LABEL: v_cndmask_abs_neg_f64:
2466; GFX11:       ; %bb.0:
2467; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2468; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2469; GFX11-NEXT:    v_mov_b32_e32 v3, 0
2470; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2471; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2472; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2473; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
2474; GFX11-NEXT:    s_clause 0x1
2475; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
2476; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2477; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2478; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
2479; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
2480; GFX11-NEXT:    s_waitcnt vmcnt(0)
2481; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
2482; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
2483; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
2484; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2485; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2486; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
2487; GFX11-NEXT:    s_endpgm
2488;
2489; GFX12-LABEL: v_cndmask_abs_neg_f64:
2490; GFX12:       ; %bb.0:
2491; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2492; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2493; GFX12-NEXT:    v_mov_b32_e32 v3, 0
2494; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2495; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2496; GFX12-NEXT:    s_wait_kmcnt 0x0
2497; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
2498; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
2499; GFX12-NEXT:    s_wait_kmcnt 0x0
2500; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
2501; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
2502; GFX12-NEXT:    s_wait_loadcnt 0x0
2503; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
2504; GFX12-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
2505; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
2506; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2507; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2508; GFX12-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
2509; GFX12-NEXT:    s_endpgm
; Kernel body. Both select operands derive from the same loaded double %f, and
; the i32 argument %c chooses fabs (c != 0) or fneg (c == 0).
2510  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
2511  %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
2512  %f = load double, ptr addrspace(1) %f.gep
2513  %f.abs = call double @llvm.fabs.f64(double %f)
2514  %f.neg = fneg double %f
2515  %setcc = icmp ne i32 %c, 0
2516  %select = select i1 %setcc, double %f.abs, double %f.neg
2517  store double %select, ptr addrspace(1) %out
2518  ret void
2519}
2520
; Attribute groups: #0 is attached to the kernel definitions, #1 to the
; llvm.amdgcn.workitem.id.x declaration and its call sites.
2521attributes #0 = { nounwind }
2522attributes #1 = { nounwind readnone }
2523