xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll (revision 70632f95664afba831cee7c819a32c56c002e80f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12 %s
6
; Kernel test: volatile-load two half values from %a and %b, compare them with
; `fcmp olt`, sign-extend the i1 result to i32 and store it to %r.
; Per the checks below, the SI run converts both operands to f32 and compares
; with v_cmp_lt_f32, while the VI, GFX11 and GFX12 runs use v_cmp_lt_f16.
7define amdgpu_kernel void @fcmp_f16_lt(
8; SI-LABEL: fcmp_f16_lt:
9; SI:       ; %bb.0: ; %entry
10; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
11; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
12; SI-NEXT:    s_mov_b32 s11, 0xf000
13; SI-NEXT:    s_mov_b32 s10, -1
14; SI-NEXT:    s_mov_b32 s14, s10
15; SI-NEXT:    s_mov_b32 s15, s11
16; SI-NEXT:    s_mov_b32 s6, s10
17; SI-NEXT:    s_mov_b32 s7, s11
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b32 s12, s2
20; SI-NEXT:    s_mov_b32 s13, s3
21; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    s_mov_b32 s8, s0
26; SI-NEXT:    s_mov_b32 s9, s1
27; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
28; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
29; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
30; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
31; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
32; SI-NEXT:    s_endpgm
33;
34; VI-LABEL: fcmp_f16_lt:
35; VI:       ; %bb.0: ; %entry
36; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
37; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
38; VI-NEXT:    s_mov_b32 s7, 0xf000
39; VI-NEXT:    s_mov_b32 s6, -1
40; VI-NEXT:    s_mov_b32 s14, s6
41; VI-NEXT:    s_waitcnt lgkmcnt(0)
42; VI-NEXT:    s_mov_b32 s12, s2
43; VI-NEXT:    s_mov_b32 s13, s3
44; VI-NEXT:    s_mov_b32 s15, s7
45; VI-NEXT:    s_mov_b32 s10, s6
46; VI-NEXT:    s_mov_b32 s11, s7
47; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
48; VI-NEXT:    s_waitcnt vmcnt(0)
49; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
50; VI-NEXT:    s_waitcnt vmcnt(0)
51; VI-NEXT:    s_mov_b32 s4, s0
52; VI-NEXT:    s_mov_b32 s5, s1
53; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
54; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
55; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
56; VI-NEXT:    s_endpgm
57;
58; GFX11-LABEL: fcmp_f16_lt:
59; GFX11:       ; %bb.0: ; %entry
60; GFX11-NEXT:    s_clause 0x1
61; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
62; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
63; GFX11-NEXT:    s_mov_b32 s10, -1
64; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
65; GFX11-NEXT:    s_mov_b32 s14, s10
66; GFX11-NEXT:    s_mov_b32 s15, s11
67; GFX11-NEXT:    s_mov_b32 s6, s10
68; GFX11-NEXT:    s_mov_b32 s7, s11
69; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX11-NEXT:    s_mov_b32 s12, s2
71; GFX11-NEXT:    s_mov_b32 s13, s3
72; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
73; GFX11-NEXT:    s_waitcnt vmcnt(0)
74; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
75; GFX11-NEXT:    s_waitcnt vmcnt(0)
76; GFX11-NEXT:    s_mov_b32 s8, s0
77; GFX11-NEXT:    s_mov_b32 s9, s1
78; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
79; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
80; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
81; GFX11-NEXT:    s_endpgm
82;
83; GFX12-LABEL: fcmp_f16_lt:
84; GFX12:       ; %bb.0: ; %entry
85; GFX12-NEXT:    s_clause 0x1
86; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
87; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
88; GFX12-NEXT:    s_mov_b32 s10, -1
89; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
90; GFX12-NEXT:    s_mov_b32 s14, s10
91; GFX12-NEXT:    s_mov_b32 s15, s11
92; GFX12-NEXT:    s_mov_b32 s6, s10
93; GFX12-NEXT:    s_mov_b32 s7, s11
94; GFX12-NEXT:    s_wait_kmcnt 0x0
95; GFX12-NEXT:    s_mov_b32 s12, s2
96; GFX12-NEXT:    s_mov_b32 s13, s3
97; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
98; GFX12-NEXT:    s_wait_loadcnt 0x0
99; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
100; GFX12-NEXT:    s_wait_loadcnt 0x0
101; GFX12-NEXT:    s_mov_b32 s8, s0
102; GFX12-NEXT:    s_mov_b32 s9, s1
103; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
104; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
105; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
106; GFX12-NEXT:    s_endpgm
107    ptr addrspace(1) %r,
108    ptr addrspace(1) %a,
109    ptr addrspace(1) %b) {
110entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
111  %a.val = load volatile half, ptr addrspace(1) %a
112  %b.val = load volatile half, ptr addrspace(1) %b
  ; Ordered less-than compare on half.
113  %r.val = fcmp olt half %a.val, %b.val
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
114  %r.val.sext = sext i1 %r.val to i32
115  store i32 %r.val.sext, ptr addrspace(1) %r
116  ret void
117}
118
; Kernel test: volatile-load two half values from %a and %b, take
; @llvm.fabs.f16 of each, compare the results with `fcmp olt`, sign-extend the
; i1 result to i32 and store it to %r.
; Per the checks below, the SI run carries the absolute value as |v| source
; modifiers on v_cvt_f32_f16_e64, the VI and GFX11 runs carry it as |v| source
; modifiers on v_cmp_lt_f16_e64, and the GFX12 run expects explicit v_and_b32
; masks with 0x7fff ahead of v_cmp_lt_f16_e32.
119define amdgpu_kernel void @fcmp_f16_lt_abs(
120; SI-LABEL: fcmp_f16_lt_abs:
121; SI:       ; %bb.0: ; %entry
122; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
123; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
124; SI-NEXT:    s_mov_b32 s11, 0xf000
125; SI-NEXT:    s_mov_b32 s10, -1
126; SI-NEXT:    s_mov_b32 s14, s10
127; SI-NEXT:    s_mov_b32 s15, s11
128; SI-NEXT:    s_mov_b32 s6, s10
129; SI-NEXT:    s_mov_b32 s7, s11
130; SI-NEXT:    s_waitcnt lgkmcnt(0)
131; SI-NEXT:    s_mov_b32 s12, s2
132; SI-NEXT:    s_mov_b32 s13, s3
133; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
134; SI-NEXT:    s_waitcnt vmcnt(0)
135; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
136; SI-NEXT:    s_waitcnt vmcnt(0)
137; SI-NEXT:    s_mov_b32 s8, s0
138; SI-NEXT:    s_mov_b32 s9, s1
139; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
140; SI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
141; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
142; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
143; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
144; SI-NEXT:    s_endpgm
145;
146; VI-LABEL: fcmp_f16_lt_abs:
147; VI:       ; %bb.0: ; %entry
148; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
149; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
150; VI-NEXT:    s_mov_b32 s7, 0xf000
151; VI-NEXT:    s_mov_b32 s6, -1
152; VI-NEXT:    s_mov_b32 s14, s6
153; VI-NEXT:    s_waitcnt lgkmcnt(0)
154; VI-NEXT:    s_mov_b32 s12, s2
155; VI-NEXT:    s_mov_b32 s13, s3
156; VI-NEXT:    s_mov_b32 s15, s7
157; VI-NEXT:    s_mov_b32 s10, s6
158; VI-NEXT:    s_mov_b32 s11, s7
159; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
160; VI-NEXT:    s_waitcnt vmcnt(0)
161; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
162; VI-NEXT:    s_waitcnt vmcnt(0)
163; VI-NEXT:    s_mov_b32 s4, s0
164; VI-NEXT:    s_mov_b32 s5, s1
165; VI-NEXT:    v_cmp_lt_f16_e64 s[0:1], |v0|, |v1|
166; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
167; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
168; VI-NEXT:    s_endpgm
169;
170; GFX11-LABEL: fcmp_f16_lt_abs:
171; GFX11:       ; %bb.0: ; %entry
172; GFX11-NEXT:    s_clause 0x1
173; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
174; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
175; GFX11-NEXT:    s_mov_b32 s10, -1
176; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
177; GFX11-NEXT:    s_mov_b32 s14, s10
178; GFX11-NEXT:    s_mov_b32 s15, s11
179; GFX11-NEXT:    s_mov_b32 s6, s10
180; GFX11-NEXT:    s_mov_b32 s7, s11
181; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX11-NEXT:    s_mov_b32 s12, s2
183; GFX11-NEXT:    s_mov_b32 s13, s3
184; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
185; GFX11-NEXT:    s_waitcnt vmcnt(0)
186; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
187; GFX11-NEXT:    s_waitcnt vmcnt(0)
188; GFX11-NEXT:    s_mov_b32 s8, s0
189; GFX11-NEXT:    s_mov_b32 s9, s1
190; GFX11-NEXT:    v_cmp_lt_f16_e64 s2, |v0|, |v1|
191; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
192; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s2
193; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
194; GFX11-NEXT:    s_endpgm
195;
196; GFX12-LABEL: fcmp_f16_lt_abs:
197; GFX12:       ; %bb.0: ; %entry
198; GFX12-NEXT:    s_clause 0x1
199; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
200; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
201; GFX12-NEXT:    s_mov_b32 s10, -1
202; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
203; GFX12-NEXT:    s_mov_b32 s14, s10
204; GFX12-NEXT:    s_mov_b32 s15, s11
205; GFX12-NEXT:    s_mov_b32 s6, s10
206; GFX12-NEXT:    s_mov_b32 s7, s11
207; GFX12-NEXT:    s_wait_kmcnt 0x0
208; GFX12-NEXT:    s_mov_b32 s12, s2
209; GFX12-NEXT:    s_mov_b32 s13, s3
210; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
211; GFX12-NEXT:    s_wait_loadcnt 0x0
212; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
213; GFX12-NEXT:    s_wait_loadcnt 0x0
214; GFX12-NEXT:    s_mov_b32 s8, s0
215; GFX12-NEXT:    s_mov_b32 s9, s1
216; GFX12-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
217; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
218; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
219; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
220; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
221; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
222; GFX12-NEXT:    s_endpgm
223    ptr addrspace(1) %r,
224    ptr addrspace(1) %a,
225    ptr addrspace(1) %b) {
226entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
227  %a.val = load volatile half, ptr addrspace(1) %a
228  %b.val = load volatile half, ptr addrspace(1) %b
  ; Absolute value of each operand feeds the compare.
229  %a.abs = call half @llvm.fabs.f16(half %a.val)
230  %b.abs = call half @llvm.fabs.f16(half %b.val)
  ; Ordered less-than compare on the two absolute values.
231  %r.val = fcmp olt half %a.abs, %b.abs
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
232  %r.val.sext = sext i1 %r.val to i32
233  store i32 %r.val.sext, ptr addrspace(1) %r
234  ret void
235}
236
; Kernel test: volatile-load two half values from %a and %b, compare them with
; `fcmp oeq`, sign-extend the i1 result to i32 and store it to %r.
; Per the checks below, the SI run converts both operands to f32 and compares
; with v_cmp_eq_f32, while the VI, GFX11 and GFX12 runs use v_cmp_eq_f16.
237define amdgpu_kernel void @fcmp_f16_eq(
238; SI-LABEL: fcmp_f16_eq:
239; SI:       ; %bb.0: ; %entry
240; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
241; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
242; SI-NEXT:    s_mov_b32 s11, 0xf000
243; SI-NEXT:    s_mov_b32 s10, -1
244; SI-NEXT:    s_mov_b32 s14, s10
245; SI-NEXT:    s_mov_b32 s15, s11
246; SI-NEXT:    s_mov_b32 s6, s10
247; SI-NEXT:    s_mov_b32 s7, s11
248; SI-NEXT:    s_waitcnt lgkmcnt(0)
249; SI-NEXT:    s_mov_b32 s12, s2
250; SI-NEXT:    s_mov_b32 s13, s3
251; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
252; SI-NEXT:    s_waitcnt vmcnt(0)
253; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
254; SI-NEXT:    s_waitcnt vmcnt(0)
255; SI-NEXT:    s_mov_b32 s8, s0
256; SI-NEXT:    s_mov_b32 s9, s1
257; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
258; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
259; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
260; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
261; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
262; SI-NEXT:    s_endpgm
263;
264; VI-LABEL: fcmp_f16_eq:
265; VI:       ; %bb.0: ; %entry
266; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
267; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
268; VI-NEXT:    s_mov_b32 s7, 0xf000
269; VI-NEXT:    s_mov_b32 s6, -1
270; VI-NEXT:    s_mov_b32 s14, s6
271; VI-NEXT:    s_waitcnt lgkmcnt(0)
272; VI-NEXT:    s_mov_b32 s12, s2
273; VI-NEXT:    s_mov_b32 s13, s3
274; VI-NEXT:    s_mov_b32 s15, s7
275; VI-NEXT:    s_mov_b32 s10, s6
276; VI-NEXT:    s_mov_b32 s11, s7
277; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
278; VI-NEXT:    s_waitcnt vmcnt(0)
279; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
280; VI-NEXT:    s_waitcnt vmcnt(0)
281; VI-NEXT:    s_mov_b32 s4, s0
282; VI-NEXT:    s_mov_b32 s5, s1
283; VI-NEXT:    v_cmp_eq_f16_e32 vcc, v0, v1
284; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
285; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
286; VI-NEXT:    s_endpgm
287;
288; GFX11-LABEL: fcmp_f16_eq:
289; GFX11:       ; %bb.0: ; %entry
290; GFX11-NEXT:    s_clause 0x1
291; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
292; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
293; GFX11-NEXT:    s_mov_b32 s10, -1
294; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
295; GFX11-NEXT:    s_mov_b32 s14, s10
296; GFX11-NEXT:    s_mov_b32 s15, s11
297; GFX11-NEXT:    s_mov_b32 s6, s10
298; GFX11-NEXT:    s_mov_b32 s7, s11
299; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
300; GFX11-NEXT:    s_mov_b32 s12, s2
301; GFX11-NEXT:    s_mov_b32 s13, s3
302; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
303; GFX11-NEXT:    s_waitcnt vmcnt(0)
304; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
305; GFX11-NEXT:    s_waitcnt vmcnt(0)
306; GFX11-NEXT:    s_mov_b32 s8, s0
307; GFX11-NEXT:    s_mov_b32 s9, s1
308; GFX11-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
309; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
310; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
311; GFX11-NEXT:    s_endpgm
312;
313; GFX12-LABEL: fcmp_f16_eq:
314; GFX12:       ; %bb.0: ; %entry
315; GFX12-NEXT:    s_clause 0x1
316; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
317; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
318; GFX12-NEXT:    s_mov_b32 s10, -1
319; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
320; GFX12-NEXT:    s_mov_b32 s14, s10
321; GFX12-NEXT:    s_mov_b32 s15, s11
322; GFX12-NEXT:    s_mov_b32 s6, s10
323; GFX12-NEXT:    s_mov_b32 s7, s11
324; GFX12-NEXT:    s_wait_kmcnt 0x0
325; GFX12-NEXT:    s_mov_b32 s12, s2
326; GFX12-NEXT:    s_mov_b32 s13, s3
327; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
328; GFX12-NEXT:    s_wait_loadcnt 0x0
329; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
330; GFX12-NEXT:    s_wait_loadcnt 0x0
331; GFX12-NEXT:    s_mov_b32 s8, s0
332; GFX12-NEXT:    s_mov_b32 s9, s1
333; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
334; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
335; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
336; GFX12-NEXT:    s_endpgm
337    ptr addrspace(1) %r,
338    ptr addrspace(1) %a,
339    ptr addrspace(1) %b) {
340entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
341  %a.val = load volatile half, ptr addrspace(1) %a
342  %b.val = load volatile half, ptr addrspace(1) %b
  ; Ordered equality compare on half.
343  %r.val = fcmp oeq half %a.val, %b.val
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
344  %r.val.sext = sext i1 %r.val to i32
345  store i32 %r.val.sext, ptr addrspace(1) %r
346  ret void
347}
348
; Kernel test: volatile-load two half values from %a and %b, compare them with
; `fcmp ole`, sign-extend the i1 result to i32 and store it to %r.
; Per the checks below, the SI run converts both operands to f32 and compares
; with v_cmp_le_f32, while the VI, GFX11 and GFX12 runs use v_cmp_le_f16.
349define amdgpu_kernel void @fcmp_f16_le(
350; SI-LABEL: fcmp_f16_le:
351; SI:       ; %bb.0: ; %entry
352; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
353; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
354; SI-NEXT:    s_mov_b32 s11, 0xf000
355; SI-NEXT:    s_mov_b32 s10, -1
356; SI-NEXT:    s_mov_b32 s14, s10
357; SI-NEXT:    s_mov_b32 s15, s11
358; SI-NEXT:    s_mov_b32 s6, s10
359; SI-NEXT:    s_mov_b32 s7, s11
360; SI-NEXT:    s_waitcnt lgkmcnt(0)
361; SI-NEXT:    s_mov_b32 s12, s2
362; SI-NEXT:    s_mov_b32 s13, s3
363; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
364; SI-NEXT:    s_waitcnt vmcnt(0)
365; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
366; SI-NEXT:    s_waitcnt vmcnt(0)
367; SI-NEXT:    s_mov_b32 s8, s0
368; SI-NEXT:    s_mov_b32 s9, s1
369; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
370; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
371; SI-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
372; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
373; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
374; SI-NEXT:    s_endpgm
375;
376; VI-LABEL: fcmp_f16_le:
377; VI:       ; %bb.0: ; %entry
378; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
379; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
380; VI-NEXT:    s_mov_b32 s7, 0xf000
381; VI-NEXT:    s_mov_b32 s6, -1
382; VI-NEXT:    s_mov_b32 s14, s6
383; VI-NEXT:    s_waitcnt lgkmcnt(0)
384; VI-NEXT:    s_mov_b32 s12, s2
385; VI-NEXT:    s_mov_b32 s13, s3
386; VI-NEXT:    s_mov_b32 s15, s7
387; VI-NEXT:    s_mov_b32 s10, s6
388; VI-NEXT:    s_mov_b32 s11, s7
389; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
390; VI-NEXT:    s_waitcnt vmcnt(0)
391; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
392; VI-NEXT:    s_waitcnt vmcnt(0)
393; VI-NEXT:    s_mov_b32 s4, s0
394; VI-NEXT:    s_mov_b32 s5, s1
395; VI-NEXT:    v_cmp_le_f16_e32 vcc, v0, v1
396; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
397; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
398; VI-NEXT:    s_endpgm
399;
400; GFX11-LABEL: fcmp_f16_le:
401; GFX11:       ; %bb.0: ; %entry
402; GFX11-NEXT:    s_clause 0x1
403; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
404; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
405; GFX11-NEXT:    s_mov_b32 s10, -1
406; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
407; GFX11-NEXT:    s_mov_b32 s14, s10
408; GFX11-NEXT:    s_mov_b32 s15, s11
409; GFX11-NEXT:    s_mov_b32 s6, s10
410; GFX11-NEXT:    s_mov_b32 s7, s11
411; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX11-NEXT:    s_mov_b32 s12, s2
413; GFX11-NEXT:    s_mov_b32 s13, s3
414; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
415; GFX11-NEXT:    s_waitcnt vmcnt(0)
416; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
417; GFX11-NEXT:    s_waitcnt vmcnt(0)
418; GFX11-NEXT:    s_mov_b32 s8, s0
419; GFX11-NEXT:    s_mov_b32 s9, s1
420; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
421; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
422; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
423; GFX11-NEXT:    s_endpgm
424;
425; GFX12-LABEL: fcmp_f16_le:
426; GFX12:       ; %bb.0: ; %entry
427; GFX12-NEXT:    s_clause 0x1
428; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
429; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
430; GFX12-NEXT:    s_mov_b32 s10, -1
431; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
432; GFX12-NEXT:    s_mov_b32 s14, s10
433; GFX12-NEXT:    s_mov_b32 s15, s11
434; GFX12-NEXT:    s_mov_b32 s6, s10
435; GFX12-NEXT:    s_mov_b32 s7, s11
436; GFX12-NEXT:    s_wait_kmcnt 0x0
437; GFX12-NEXT:    s_mov_b32 s12, s2
438; GFX12-NEXT:    s_mov_b32 s13, s3
439; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
440; GFX12-NEXT:    s_wait_loadcnt 0x0
441; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
442; GFX12-NEXT:    s_wait_loadcnt 0x0
443; GFX12-NEXT:    s_mov_b32 s8, s0
444; GFX12-NEXT:    s_mov_b32 s9, s1
445; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
446; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
447; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
448; GFX12-NEXT:    s_endpgm
449    ptr addrspace(1) %r,
450    ptr addrspace(1) %a,
451    ptr addrspace(1) %b) {
452entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
453  %a.val = load volatile half, ptr addrspace(1) %a
454  %b.val = load volatile half, ptr addrspace(1) %b
  ; Ordered less-than-or-equal compare on half.
455  %r.val = fcmp ole half %a.val, %b.val
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
456  %r.val.sext = sext i1 %r.val to i32
457  store i32 %r.val.sext, ptr addrspace(1) %r
458  ret void
459}
460
; Kernel test: volatile-load two half values from %a and %b, compare them with
; `fcmp ogt`, sign-extend the i1 result to i32 and store it to %r.
; Per the checks below, the SI run converts both operands to f32 and compares
; with v_cmp_gt_f32, while the VI, GFX11 and GFX12 runs use v_cmp_gt_f16.
461define amdgpu_kernel void @fcmp_f16_gt(
462; SI-LABEL: fcmp_f16_gt:
463; SI:       ; %bb.0: ; %entry
464; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
465; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
466; SI-NEXT:    s_mov_b32 s11, 0xf000
467; SI-NEXT:    s_mov_b32 s10, -1
468; SI-NEXT:    s_mov_b32 s14, s10
469; SI-NEXT:    s_mov_b32 s15, s11
470; SI-NEXT:    s_mov_b32 s6, s10
471; SI-NEXT:    s_mov_b32 s7, s11
472; SI-NEXT:    s_waitcnt lgkmcnt(0)
473; SI-NEXT:    s_mov_b32 s12, s2
474; SI-NEXT:    s_mov_b32 s13, s3
475; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
476; SI-NEXT:    s_waitcnt vmcnt(0)
477; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
478; SI-NEXT:    s_waitcnt vmcnt(0)
479; SI-NEXT:    s_mov_b32 s8, s0
480; SI-NEXT:    s_mov_b32 s9, s1
481; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
482; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
483; SI-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
484; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
485; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
486; SI-NEXT:    s_endpgm
487;
488; VI-LABEL: fcmp_f16_gt:
489; VI:       ; %bb.0: ; %entry
490; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
491; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
492; VI-NEXT:    s_mov_b32 s7, 0xf000
493; VI-NEXT:    s_mov_b32 s6, -1
494; VI-NEXT:    s_mov_b32 s14, s6
495; VI-NEXT:    s_waitcnt lgkmcnt(0)
496; VI-NEXT:    s_mov_b32 s12, s2
497; VI-NEXT:    s_mov_b32 s13, s3
498; VI-NEXT:    s_mov_b32 s15, s7
499; VI-NEXT:    s_mov_b32 s10, s6
500; VI-NEXT:    s_mov_b32 s11, s7
501; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
502; VI-NEXT:    s_waitcnt vmcnt(0)
503; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
504; VI-NEXT:    s_waitcnt vmcnt(0)
505; VI-NEXT:    s_mov_b32 s4, s0
506; VI-NEXT:    s_mov_b32 s5, s1
507; VI-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
508; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
509; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
510; VI-NEXT:    s_endpgm
511;
512; GFX11-LABEL: fcmp_f16_gt:
513; GFX11:       ; %bb.0: ; %entry
514; GFX11-NEXT:    s_clause 0x1
515; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
516; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
517; GFX11-NEXT:    s_mov_b32 s10, -1
518; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
519; GFX11-NEXT:    s_mov_b32 s14, s10
520; GFX11-NEXT:    s_mov_b32 s15, s11
521; GFX11-NEXT:    s_mov_b32 s6, s10
522; GFX11-NEXT:    s_mov_b32 s7, s11
523; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX11-NEXT:    s_mov_b32 s12, s2
525; GFX11-NEXT:    s_mov_b32 s13, s3
526; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
527; GFX11-NEXT:    s_waitcnt vmcnt(0)
528; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
529; GFX11-NEXT:    s_waitcnt vmcnt(0)
530; GFX11-NEXT:    s_mov_b32 s8, s0
531; GFX11-NEXT:    s_mov_b32 s9, s1
532; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
533; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
534; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
535; GFX11-NEXT:    s_endpgm
536;
537; GFX12-LABEL: fcmp_f16_gt:
538; GFX12:       ; %bb.0: ; %entry
539; GFX12-NEXT:    s_clause 0x1
540; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
541; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
542; GFX12-NEXT:    s_mov_b32 s10, -1
543; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
544; GFX12-NEXT:    s_mov_b32 s14, s10
545; GFX12-NEXT:    s_mov_b32 s15, s11
546; GFX12-NEXT:    s_mov_b32 s6, s10
547; GFX12-NEXT:    s_mov_b32 s7, s11
548; GFX12-NEXT:    s_wait_kmcnt 0x0
549; GFX12-NEXT:    s_mov_b32 s12, s2
550; GFX12-NEXT:    s_mov_b32 s13, s3
551; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
552; GFX12-NEXT:    s_wait_loadcnt 0x0
553; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
554; GFX12-NEXT:    s_wait_loadcnt 0x0
555; GFX12-NEXT:    s_mov_b32 s8, s0
556; GFX12-NEXT:    s_mov_b32 s9, s1
557; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
558; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
559; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
560; GFX12-NEXT:    s_endpgm
561    ptr addrspace(1) %r,
562    ptr addrspace(1) %a,
563    ptr addrspace(1) %b) {
564entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
565  %a.val = load volatile half, ptr addrspace(1) %a
566  %b.val = load volatile half, ptr addrspace(1) %b
  ; Ordered greater-than compare on half.
567  %r.val = fcmp ogt half %a.val, %b.val
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
568  %r.val.sext = sext i1 %r.val to i32
569  store i32 %r.val.sext, ptr addrspace(1) %r
570  ret void
571}
572
; Kernel test: volatile-load two half values from %a and %b, compare them with
; `fcmp one`, sign-extend the i1 result to i32 and store it to %r.
; Per the checks below, the SI run converts both operands to f32 and compares
; with v_cmp_lg_f32, while the VI, GFX11 and GFX12 runs use v_cmp_lg_f16.
573define amdgpu_kernel void @fcmp_f16_lg(
574; SI-LABEL: fcmp_f16_lg:
575; SI:       ; %bb.0: ; %entry
576; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
577; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
578; SI-NEXT:    s_mov_b32 s11, 0xf000
579; SI-NEXT:    s_mov_b32 s10, -1
580; SI-NEXT:    s_mov_b32 s14, s10
581; SI-NEXT:    s_mov_b32 s15, s11
582; SI-NEXT:    s_mov_b32 s6, s10
583; SI-NEXT:    s_mov_b32 s7, s11
584; SI-NEXT:    s_waitcnt lgkmcnt(0)
585; SI-NEXT:    s_mov_b32 s12, s2
586; SI-NEXT:    s_mov_b32 s13, s3
587; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
588; SI-NEXT:    s_waitcnt vmcnt(0)
589; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
590; SI-NEXT:    s_waitcnt vmcnt(0)
591; SI-NEXT:    s_mov_b32 s8, s0
592; SI-NEXT:    s_mov_b32 s9, s1
593; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
594; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
595; SI-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
596; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
597; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
598; SI-NEXT:    s_endpgm
599;
600; VI-LABEL: fcmp_f16_lg:
601; VI:       ; %bb.0: ; %entry
602; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
603; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
604; VI-NEXT:    s_mov_b32 s7, 0xf000
605; VI-NEXT:    s_mov_b32 s6, -1
606; VI-NEXT:    s_mov_b32 s14, s6
607; VI-NEXT:    s_waitcnt lgkmcnt(0)
608; VI-NEXT:    s_mov_b32 s12, s2
609; VI-NEXT:    s_mov_b32 s13, s3
610; VI-NEXT:    s_mov_b32 s15, s7
611; VI-NEXT:    s_mov_b32 s10, s6
612; VI-NEXT:    s_mov_b32 s11, s7
613; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
614; VI-NEXT:    s_waitcnt vmcnt(0)
615; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
616; VI-NEXT:    s_waitcnt vmcnt(0)
617; VI-NEXT:    s_mov_b32 s4, s0
618; VI-NEXT:    s_mov_b32 s5, s1
619; VI-NEXT:    v_cmp_lg_f16_e32 vcc, v0, v1
620; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
621; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
622; VI-NEXT:    s_endpgm
623;
624; GFX11-LABEL: fcmp_f16_lg:
625; GFX11:       ; %bb.0: ; %entry
626; GFX11-NEXT:    s_clause 0x1
627; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
628; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
629; GFX11-NEXT:    s_mov_b32 s10, -1
630; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
631; GFX11-NEXT:    s_mov_b32 s14, s10
632; GFX11-NEXT:    s_mov_b32 s15, s11
633; GFX11-NEXT:    s_mov_b32 s6, s10
634; GFX11-NEXT:    s_mov_b32 s7, s11
635; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX11-NEXT:    s_mov_b32 s12, s2
637; GFX11-NEXT:    s_mov_b32 s13, s3
638; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
639; GFX11-NEXT:    s_waitcnt vmcnt(0)
640; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
641; GFX11-NEXT:    s_waitcnt vmcnt(0)
642; GFX11-NEXT:    s_mov_b32 s8, s0
643; GFX11-NEXT:    s_mov_b32 s9, s1
644; GFX11-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
645; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
646; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
647; GFX11-NEXT:    s_endpgm
648;
649; GFX12-LABEL: fcmp_f16_lg:
650; GFX12:       ; %bb.0: ; %entry
651; GFX12-NEXT:    s_clause 0x1
652; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
653; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
654; GFX12-NEXT:    s_mov_b32 s10, -1
655; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
656; GFX12-NEXT:    s_mov_b32 s14, s10
657; GFX12-NEXT:    s_mov_b32 s15, s11
658; GFX12-NEXT:    s_mov_b32 s6, s10
659; GFX12-NEXT:    s_mov_b32 s7, s11
660; GFX12-NEXT:    s_wait_kmcnt 0x0
661; GFX12-NEXT:    s_mov_b32 s12, s2
662; GFX12-NEXT:    s_mov_b32 s13, s3
663; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
664; GFX12-NEXT:    s_wait_loadcnt 0x0
665; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
666; GFX12-NEXT:    s_wait_loadcnt 0x0
667; GFX12-NEXT:    s_mov_b32 s8, s0
668; GFX12-NEXT:    s_mov_b32 s9, s1
669; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
670; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
671; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
672; GFX12-NEXT:    s_endpgm
673    ptr addrspace(1) %r,
674    ptr addrspace(1) %a,
675    ptr addrspace(1) %b) {
676entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
677  %a.val = load volatile half, ptr addrspace(1) %a
678  %b.val = load volatile half, ptr addrspace(1) %b
  ; Ordered not-equal compare on half; the checks expect the "lg" opcode.
679  %r.val = fcmp one half %a.val, %b.val
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
680  %r.val.sext = sext i1 %r.val to i32
681  store i32 %r.val.sext, ptr addrspace(1) %r
682  ret void
683}
684
; Kernel test: volatile-load two half values from %a and %b, compare them with
; `fcmp oge`, sign-extend the i1 result to i32 and store it to %r.
; Per the checks below, the SI run converts both operands to f32 and compares
; with v_cmp_ge_f32, while the VI, GFX11 and GFX12 runs use v_cmp_ge_f16.
685define amdgpu_kernel void @fcmp_f16_ge(
686; SI-LABEL: fcmp_f16_ge:
687; SI:       ; %bb.0: ; %entry
688; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
689; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
690; SI-NEXT:    s_mov_b32 s11, 0xf000
691; SI-NEXT:    s_mov_b32 s10, -1
692; SI-NEXT:    s_mov_b32 s14, s10
693; SI-NEXT:    s_mov_b32 s15, s11
694; SI-NEXT:    s_mov_b32 s6, s10
695; SI-NEXT:    s_mov_b32 s7, s11
696; SI-NEXT:    s_waitcnt lgkmcnt(0)
697; SI-NEXT:    s_mov_b32 s12, s2
698; SI-NEXT:    s_mov_b32 s13, s3
699; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
700; SI-NEXT:    s_waitcnt vmcnt(0)
701; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
702; SI-NEXT:    s_waitcnt vmcnt(0)
703; SI-NEXT:    s_mov_b32 s8, s0
704; SI-NEXT:    s_mov_b32 s9, s1
705; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
706; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
707; SI-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
708; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
709; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
710; SI-NEXT:    s_endpgm
711;
712; VI-LABEL: fcmp_f16_ge:
713; VI:       ; %bb.0: ; %entry
714; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
715; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
716; VI-NEXT:    s_mov_b32 s7, 0xf000
717; VI-NEXT:    s_mov_b32 s6, -1
718; VI-NEXT:    s_mov_b32 s14, s6
719; VI-NEXT:    s_waitcnt lgkmcnt(0)
720; VI-NEXT:    s_mov_b32 s12, s2
721; VI-NEXT:    s_mov_b32 s13, s3
722; VI-NEXT:    s_mov_b32 s15, s7
723; VI-NEXT:    s_mov_b32 s10, s6
724; VI-NEXT:    s_mov_b32 s11, s7
725; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
726; VI-NEXT:    s_waitcnt vmcnt(0)
727; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
728; VI-NEXT:    s_waitcnt vmcnt(0)
729; VI-NEXT:    s_mov_b32 s4, s0
730; VI-NEXT:    s_mov_b32 s5, s1
731; VI-NEXT:    v_cmp_ge_f16_e32 vcc, v0, v1
732; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
733; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
734; VI-NEXT:    s_endpgm
735;
736; GFX11-LABEL: fcmp_f16_ge:
737; GFX11:       ; %bb.0: ; %entry
738; GFX11-NEXT:    s_clause 0x1
739; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
740; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
741; GFX11-NEXT:    s_mov_b32 s10, -1
742; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
743; GFX11-NEXT:    s_mov_b32 s14, s10
744; GFX11-NEXT:    s_mov_b32 s15, s11
745; GFX11-NEXT:    s_mov_b32 s6, s10
746; GFX11-NEXT:    s_mov_b32 s7, s11
747; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX11-NEXT:    s_mov_b32 s12, s2
749; GFX11-NEXT:    s_mov_b32 s13, s3
750; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
751; GFX11-NEXT:    s_waitcnt vmcnt(0)
752; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
753; GFX11-NEXT:    s_waitcnt vmcnt(0)
754; GFX11-NEXT:    s_mov_b32 s8, s0
755; GFX11-NEXT:    s_mov_b32 s9, s1
756; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
757; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
758; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
759; GFX11-NEXT:    s_endpgm
760;
761; GFX12-LABEL: fcmp_f16_ge:
762; GFX12:       ; %bb.0: ; %entry
763; GFX12-NEXT:    s_clause 0x1
764; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
765; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
766; GFX12-NEXT:    s_mov_b32 s10, -1
767; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
768; GFX12-NEXT:    s_mov_b32 s14, s10
769; GFX12-NEXT:    s_mov_b32 s15, s11
770; GFX12-NEXT:    s_mov_b32 s6, s10
771; GFX12-NEXT:    s_mov_b32 s7, s11
772; GFX12-NEXT:    s_wait_kmcnt 0x0
773; GFX12-NEXT:    s_mov_b32 s12, s2
774; GFX12-NEXT:    s_mov_b32 s13, s3
775; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
776; GFX12-NEXT:    s_wait_loadcnt 0x0
777; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
778; GFX12-NEXT:    s_wait_loadcnt 0x0
779; GFX12-NEXT:    s_mov_b32 s8, s0
780; GFX12-NEXT:    s_mov_b32 s9, s1
781; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
782; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
783; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
784; GFX12-NEXT:    s_endpgm
785    ptr addrspace(1) %r,
786    ptr addrspace(1) %a,
787    ptr addrspace(1) %b) {
788entry:
  ; Both operands are read with volatile loads from addrspace(1) pointers.
789  %a.val = load volatile half, ptr addrspace(1) %a
790  %b.val = load volatile half, ptr addrspace(1) %b
  ; Ordered greater-than-or-equal compare on half.
791  %r.val = fcmp oge half %a.val, %b.val
  ; sext of the i1 gives -1 for true and 0 for false; the checks select
  ; between 0 and -1 with v_cndmask_b32 accordingly.
792  %r.val.sext = sext i1 %r.val to i32
793  store i32 %r.val.sext, ptr addrspace(1) %r
794  ret void
795}
796
; Ordered compare (fcmp ord) of two half values.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_o_f32; the other three prefixes expect
; a native v_cmp_o_f16. The 0/-1 value is produced by v_cndmask_b32.
797define amdgpu_kernel void @fcmp_f16_o(
798; SI-LABEL: fcmp_f16_o:
799; SI:       ; %bb.0: ; %entry
800; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
801; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
802; SI-NEXT:    s_mov_b32 s11, 0xf000
803; SI-NEXT:    s_mov_b32 s10, -1
804; SI-NEXT:    s_mov_b32 s14, s10
805; SI-NEXT:    s_mov_b32 s15, s11
806; SI-NEXT:    s_mov_b32 s6, s10
807; SI-NEXT:    s_mov_b32 s7, s11
808; SI-NEXT:    s_waitcnt lgkmcnt(0)
809; SI-NEXT:    s_mov_b32 s12, s2
810; SI-NEXT:    s_mov_b32 s13, s3
811; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
812; SI-NEXT:    s_waitcnt vmcnt(0)
813; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
814; SI-NEXT:    s_waitcnt vmcnt(0)
815; SI-NEXT:    s_mov_b32 s8, s0
816; SI-NEXT:    s_mov_b32 s9, s1
817; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
818; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
819; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
820; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
821; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
822; SI-NEXT:    s_endpgm
823;
824; VI-LABEL: fcmp_f16_o:
825; VI:       ; %bb.0: ; %entry
826; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
827; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
828; VI-NEXT:    s_mov_b32 s7, 0xf000
829; VI-NEXT:    s_mov_b32 s6, -1
830; VI-NEXT:    s_mov_b32 s14, s6
831; VI-NEXT:    s_waitcnt lgkmcnt(0)
832; VI-NEXT:    s_mov_b32 s12, s2
833; VI-NEXT:    s_mov_b32 s13, s3
834; VI-NEXT:    s_mov_b32 s15, s7
835; VI-NEXT:    s_mov_b32 s10, s6
836; VI-NEXT:    s_mov_b32 s11, s7
837; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
838; VI-NEXT:    s_waitcnt vmcnt(0)
839; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
840; VI-NEXT:    s_waitcnt vmcnt(0)
841; VI-NEXT:    s_mov_b32 s4, s0
842; VI-NEXT:    s_mov_b32 s5, s1
843; VI-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
844; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
845; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
846; VI-NEXT:    s_endpgm
847;
848; GFX11-LABEL: fcmp_f16_o:
849; GFX11:       ; %bb.0: ; %entry
850; GFX11-NEXT:    s_clause 0x1
851; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
852; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
853; GFX11-NEXT:    s_mov_b32 s10, -1
854; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
855; GFX11-NEXT:    s_mov_b32 s14, s10
856; GFX11-NEXT:    s_mov_b32 s15, s11
857; GFX11-NEXT:    s_mov_b32 s6, s10
858; GFX11-NEXT:    s_mov_b32 s7, s11
859; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
860; GFX11-NEXT:    s_mov_b32 s12, s2
861; GFX11-NEXT:    s_mov_b32 s13, s3
862; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
863; GFX11-NEXT:    s_waitcnt vmcnt(0)
864; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
865; GFX11-NEXT:    s_waitcnt vmcnt(0)
866; GFX11-NEXT:    s_mov_b32 s8, s0
867; GFX11-NEXT:    s_mov_b32 s9, s1
868; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
869; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
870; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
871; GFX11-NEXT:    s_endpgm
872;
873; GFX12-LABEL: fcmp_f16_o:
874; GFX12:       ; %bb.0: ; %entry
875; GFX12-NEXT:    s_clause 0x1
876; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
877; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
878; GFX12-NEXT:    s_mov_b32 s10, -1
879; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
880; GFX12-NEXT:    s_mov_b32 s14, s10
881; GFX12-NEXT:    s_mov_b32 s15, s11
882; GFX12-NEXT:    s_mov_b32 s6, s10
883; GFX12-NEXT:    s_mov_b32 s7, s11
884; GFX12-NEXT:    s_wait_kmcnt 0x0
885; GFX12-NEXT:    s_mov_b32 s12, s2
886; GFX12-NEXT:    s_mov_b32 s13, s3
887; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
888; GFX12-NEXT:    s_wait_loadcnt 0x0
889; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
890; GFX12-NEXT:    s_wait_loadcnt 0x0
891; GFX12-NEXT:    s_mov_b32 s8, s0
892; GFX12-NEXT:    s_mov_b32 s9, s1
893; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
894; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
895; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
896; GFX12-NEXT:    s_endpgm
897    ptr addrspace(1) %r,
898    ptr addrspace(1) %a,
899    ptr addrspace(1) %b) {
900entry:
901  %a.val = load volatile half, ptr addrspace(1) %a
902  %b.val = load volatile half, ptr addrspace(1) %b
903  %r.val = fcmp ord half %a.val, %b.val
904  %r.val.sext = sext i1 %r.val to i32
905  store i32 %r.val.sext, ptr addrspace(1) %r
906  ret void
907}
908
; Unordered compare (fcmp uno) of two half values.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_u_f32; the other three prefixes expect
; a native v_cmp_u_f16. The 0/-1 value is produced by v_cndmask_b32.
909define amdgpu_kernel void @fcmp_f16_u(
910; SI-LABEL: fcmp_f16_u:
911; SI:       ; %bb.0: ; %entry
912; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
913; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
914; SI-NEXT:    s_mov_b32 s11, 0xf000
915; SI-NEXT:    s_mov_b32 s10, -1
916; SI-NEXT:    s_mov_b32 s14, s10
917; SI-NEXT:    s_mov_b32 s15, s11
918; SI-NEXT:    s_mov_b32 s6, s10
919; SI-NEXT:    s_mov_b32 s7, s11
920; SI-NEXT:    s_waitcnt lgkmcnt(0)
921; SI-NEXT:    s_mov_b32 s12, s2
922; SI-NEXT:    s_mov_b32 s13, s3
923; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
924; SI-NEXT:    s_waitcnt vmcnt(0)
925; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
926; SI-NEXT:    s_waitcnt vmcnt(0)
927; SI-NEXT:    s_mov_b32 s8, s0
928; SI-NEXT:    s_mov_b32 s9, s1
929; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
930; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
931; SI-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
932; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
933; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
934; SI-NEXT:    s_endpgm
935;
936; VI-LABEL: fcmp_f16_u:
937; VI:       ; %bb.0: ; %entry
938; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
939; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
940; VI-NEXT:    s_mov_b32 s7, 0xf000
941; VI-NEXT:    s_mov_b32 s6, -1
942; VI-NEXT:    s_mov_b32 s14, s6
943; VI-NEXT:    s_waitcnt lgkmcnt(0)
944; VI-NEXT:    s_mov_b32 s12, s2
945; VI-NEXT:    s_mov_b32 s13, s3
946; VI-NEXT:    s_mov_b32 s15, s7
947; VI-NEXT:    s_mov_b32 s10, s6
948; VI-NEXT:    s_mov_b32 s11, s7
949; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
950; VI-NEXT:    s_waitcnt vmcnt(0)
951; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
952; VI-NEXT:    s_waitcnt vmcnt(0)
953; VI-NEXT:    s_mov_b32 s4, s0
954; VI-NEXT:    s_mov_b32 s5, s1
955; VI-NEXT:    v_cmp_u_f16_e32 vcc, v0, v1
956; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
957; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
958; VI-NEXT:    s_endpgm
959;
960; GFX11-LABEL: fcmp_f16_u:
961; GFX11:       ; %bb.0: ; %entry
962; GFX11-NEXT:    s_clause 0x1
963; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
964; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
965; GFX11-NEXT:    s_mov_b32 s10, -1
966; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
967; GFX11-NEXT:    s_mov_b32 s14, s10
968; GFX11-NEXT:    s_mov_b32 s15, s11
969; GFX11-NEXT:    s_mov_b32 s6, s10
970; GFX11-NEXT:    s_mov_b32 s7, s11
971; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX11-NEXT:    s_mov_b32 s12, s2
973; GFX11-NEXT:    s_mov_b32 s13, s3
974; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
975; GFX11-NEXT:    s_waitcnt vmcnt(0)
976; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
977; GFX11-NEXT:    s_waitcnt vmcnt(0)
978; GFX11-NEXT:    s_mov_b32 s8, s0
979; GFX11-NEXT:    s_mov_b32 s9, s1
980; GFX11-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
981; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
982; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
983; GFX11-NEXT:    s_endpgm
984;
985; GFX12-LABEL: fcmp_f16_u:
986; GFX12:       ; %bb.0: ; %entry
987; GFX12-NEXT:    s_clause 0x1
988; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
989; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
990; GFX12-NEXT:    s_mov_b32 s10, -1
991; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
992; GFX12-NEXT:    s_mov_b32 s14, s10
993; GFX12-NEXT:    s_mov_b32 s15, s11
994; GFX12-NEXT:    s_mov_b32 s6, s10
995; GFX12-NEXT:    s_mov_b32 s7, s11
996; GFX12-NEXT:    s_wait_kmcnt 0x0
997; GFX12-NEXT:    s_mov_b32 s12, s2
998; GFX12-NEXT:    s_mov_b32 s13, s3
999; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1000; GFX12-NEXT:    s_wait_loadcnt 0x0
1001; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1002; GFX12-NEXT:    s_wait_loadcnt 0x0
1003; GFX12-NEXT:    s_mov_b32 s8, s0
1004; GFX12-NEXT:    s_mov_b32 s9, s1
1005; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
1006; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1007; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1008; GFX12-NEXT:    s_endpgm
1009    ptr addrspace(1) %r,
1010    ptr addrspace(1) %a,
1011    ptr addrspace(1) %b) {
1012entry:
1013  %a.val = load volatile half, ptr addrspace(1) %a
1014  %b.val = load volatile half, ptr addrspace(1) %b
1015  %r.val = fcmp uno half %a.val, %b.val
1016  %r.val.sext = sext i1 %r.val to i32
1017  store i32 %r.val.sext, ptr addrspace(1) %r
1018  ret void
1019}
1020
; Unordered-or-less-than compare (fcmp ult) of two half values; the kernel is
; named after the "nge" (not greater-or-equal) compare the checks expect.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_nge_f32; the other three prefixes
; expect a native v_cmp_nge_f16. The 0/-1 value is produced by v_cndmask_b32.
1021define amdgpu_kernel void @fcmp_f16_nge(
1022; SI-LABEL: fcmp_f16_nge:
1023; SI:       ; %bb.0: ; %entry
1024; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1025; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1026; SI-NEXT:    s_mov_b32 s11, 0xf000
1027; SI-NEXT:    s_mov_b32 s10, -1
1028; SI-NEXT:    s_mov_b32 s14, s10
1029; SI-NEXT:    s_mov_b32 s15, s11
1030; SI-NEXT:    s_mov_b32 s6, s10
1031; SI-NEXT:    s_mov_b32 s7, s11
1032; SI-NEXT:    s_waitcnt lgkmcnt(0)
1033; SI-NEXT:    s_mov_b32 s12, s2
1034; SI-NEXT:    s_mov_b32 s13, s3
1035; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1036; SI-NEXT:    s_waitcnt vmcnt(0)
1037; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
1038; SI-NEXT:    s_waitcnt vmcnt(0)
1039; SI-NEXT:    s_mov_b32 s8, s0
1040; SI-NEXT:    s_mov_b32 s9, s1
1041; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1042; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1043; SI-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
1044; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1045; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1046; SI-NEXT:    s_endpgm
1047;
1048; VI-LABEL: fcmp_f16_nge:
1049; VI:       ; %bb.0: ; %entry
1050; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1051; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1052; VI-NEXT:    s_mov_b32 s7, 0xf000
1053; VI-NEXT:    s_mov_b32 s6, -1
1054; VI-NEXT:    s_mov_b32 s14, s6
1055; VI-NEXT:    s_waitcnt lgkmcnt(0)
1056; VI-NEXT:    s_mov_b32 s12, s2
1057; VI-NEXT:    s_mov_b32 s13, s3
1058; VI-NEXT:    s_mov_b32 s15, s7
1059; VI-NEXT:    s_mov_b32 s10, s6
1060; VI-NEXT:    s_mov_b32 s11, s7
1061; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1062; VI-NEXT:    s_waitcnt vmcnt(0)
1063; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
1064; VI-NEXT:    s_waitcnt vmcnt(0)
1065; VI-NEXT:    s_mov_b32 s4, s0
1066; VI-NEXT:    s_mov_b32 s5, s1
1067; VI-NEXT:    v_cmp_nge_f16_e32 vcc, v0, v1
1068; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1069; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1070; VI-NEXT:    s_endpgm
1071;
1072; GFX11-LABEL: fcmp_f16_nge:
1073; GFX11:       ; %bb.0: ; %entry
1074; GFX11-NEXT:    s_clause 0x1
1075; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1076; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1077; GFX11-NEXT:    s_mov_b32 s10, -1
1078; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1079; GFX11-NEXT:    s_mov_b32 s14, s10
1080; GFX11-NEXT:    s_mov_b32 s15, s11
1081; GFX11-NEXT:    s_mov_b32 s6, s10
1082; GFX11-NEXT:    s_mov_b32 s7, s11
1083; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX11-NEXT:    s_mov_b32 s12, s2
1085; GFX11-NEXT:    s_mov_b32 s13, s3
1086; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1087; GFX11-NEXT:    s_waitcnt vmcnt(0)
1088; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1089; GFX11-NEXT:    s_waitcnt vmcnt(0)
1090; GFX11-NEXT:    s_mov_b32 s8, s0
1091; GFX11-NEXT:    s_mov_b32 s9, s1
1092; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
1093; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1094; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1095; GFX11-NEXT:    s_endpgm
1096;
1097; GFX12-LABEL: fcmp_f16_nge:
1098; GFX12:       ; %bb.0: ; %entry
1099; GFX12-NEXT:    s_clause 0x1
1100; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1101; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1102; GFX12-NEXT:    s_mov_b32 s10, -1
1103; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1104; GFX12-NEXT:    s_mov_b32 s14, s10
1105; GFX12-NEXT:    s_mov_b32 s15, s11
1106; GFX12-NEXT:    s_mov_b32 s6, s10
1107; GFX12-NEXT:    s_mov_b32 s7, s11
1108; GFX12-NEXT:    s_wait_kmcnt 0x0
1109; GFX12-NEXT:    s_mov_b32 s12, s2
1110; GFX12-NEXT:    s_mov_b32 s13, s3
1111; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1112; GFX12-NEXT:    s_wait_loadcnt 0x0
1113; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1114; GFX12-NEXT:    s_wait_loadcnt 0x0
1115; GFX12-NEXT:    s_mov_b32 s8, s0
1116; GFX12-NEXT:    s_mov_b32 s9, s1
1117; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
1118; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1119; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1120; GFX12-NEXT:    s_endpgm
1121    ptr addrspace(1) %r,
1122    ptr addrspace(1) %a,
1123    ptr addrspace(1) %b) {
1124entry:
1125  %a.val = load volatile half, ptr addrspace(1) %a
1126  %b.val = load volatile half, ptr addrspace(1) %b
1127  %r.val = fcmp ult half %a.val, %b.val
1128  %r.val.sext = sext i1 %r.val to i32
1129  store i32 %r.val.sext, ptr addrspace(1) %r
1130  ret void
1131}
1132
; Unordered-or-equal compare (fcmp ueq) of two half values; the kernel is
; named after the "nlg" (not less-or-greater) compare the checks expect.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_nlg_f32; the other three prefixes
; expect a native v_cmp_nlg_f16. The 0/-1 value is produced by v_cndmask_b32.
1133define amdgpu_kernel void @fcmp_f16_nlg(
1134; SI-LABEL: fcmp_f16_nlg:
1135; SI:       ; %bb.0: ; %entry
1136; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1137; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1138; SI-NEXT:    s_mov_b32 s11, 0xf000
1139; SI-NEXT:    s_mov_b32 s10, -1
1140; SI-NEXT:    s_mov_b32 s14, s10
1141; SI-NEXT:    s_mov_b32 s15, s11
1142; SI-NEXT:    s_mov_b32 s6, s10
1143; SI-NEXT:    s_mov_b32 s7, s11
1144; SI-NEXT:    s_waitcnt lgkmcnt(0)
1145; SI-NEXT:    s_mov_b32 s12, s2
1146; SI-NEXT:    s_mov_b32 s13, s3
1147; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1148; SI-NEXT:    s_waitcnt vmcnt(0)
1149; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
1150; SI-NEXT:    s_waitcnt vmcnt(0)
1151; SI-NEXT:    s_mov_b32 s8, s0
1152; SI-NEXT:    s_mov_b32 s9, s1
1153; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1154; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1155; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
1156; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1157; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1158; SI-NEXT:    s_endpgm
1159;
1160; VI-LABEL: fcmp_f16_nlg:
1161; VI:       ; %bb.0: ; %entry
1162; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1163; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1164; VI-NEXT:    s_mov_b32 s7, 0xf000
1165; VI-NEXT:    s_mov_b32 s6, -1
1166; VI-NEXT:    s_mov_b32 s14, s6
1167; VI-NEXT:    s_waitcnt lgkmcnt(0)
1168; VI-NEXT:    s_mov_b32 s12, s2
1169; VI-NEXT:    s_mov_b32 s13, s3
1170; VI-NEXT:    s_mov_b32 s15, s7
1171; VI-NEXT:    s_mov_b32 s10, s6
1172; VI-NEXT:    s_mov_b32 s11, s7
1173; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1174; VI-NEXT:    s_waitcnt vmcnt(0)
1175; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
1176; VI-NEXT:    s_waitcnt vmcnt(0)
1177; VI-NEXT:    s_mov_b32 s4, s0
1178; VI-NEXT:    s_mov_b32 s5, s1
1179; VI-NEXT:    v_cmp_nlg_f16_e32 vcc, v0, v1
1180; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1181; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1182; VI-NEXT:    s_endpgm
1183;
1184; GFX11-LABEL: fcmp_f16_nlg:
1185; GFX11:       ; %bb.0: ; %entry
1186; GFX11-NEXT:    s_clause 0x1
1187; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1188; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1189; GFX11-NEXT:    s_mov_b32 s10, -1
1190; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1191; GFX11-NEXT:    s_mov_b32 s14, s10
1192; GFX11-NEXT:    s_mov_b32 s15, s11
1193; GFX11-NEXT:    s_mov_b32 s6, s10
1194; GFX11-NEXT:    s_mov_b32 s7, s11
1195; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1196; GFX11-NEXT:    s_mov_b32 s12, s2
1197; GFX11-NEXT:    s_mov_b32 s13, s3
1198; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1199; GFX11-NEXT:    s_waitcnt vmcnt(0)
1200; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1201; GFX11-NEXT:    s_waitcnt vmcnt(0)
1202; GFX11-NEXT:    s_mov_b32 s8, s0
1203; GFX11-NEXT:    s_mov_b32 s9, s1
1204; GFX11-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
1205; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1206; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1207; GFX11-NEXT:    s_endpgm
1208;
1209; GFX12-LABEL: fcmp_f16_nlg:
1210; GFX12:       ; %bb.0: ; %entry
1211; GFX12-NEXT:    s_clause 0x1
1212; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1213; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1214; GFX12-NEXT:    s_mov_b32 s10, -1
1215; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1216; GFX12-NEXT:    s_mov_b32 s14, s10
1217; GFX12-NEXT:    s_mov_b32 s15, s11
1218; GFX12-NEXT:    s_mov_b32 s6, s10
1219; GFX12-NEXT:    s_mov_b32 s7, s11
1220; GFX12-NEXT:    s_wait_kmcnt 0x0
1221; GFX12-NEXT:    s_mov_b32 s12, s2
1222; GFX12-NEXT:    s_mov_b32 s13, s3
1223; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1224; GFX12-NEXT:    s_wait_loadcnt 0x0
1225; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1226; GFX12-NEXT:    s_wait_loadcnt 0x0
1227; GFX12-NEXT:    s_mov_b32 s8, s0
1228; GFX12-NEXT:    s_mov_b32 s9, s1
1229; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
1230; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1231; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1232; GFX12-NEXT:    s_endpgm
1233    ptr addrspace(1) %r,
1234    ptr addrspace(1) %a,
1235    ptr addrspace(1) %b) {
1236entry:
1237  %a.val = load volatile half, ptr addrspace(1) %a
1238  %b.val = load volatile half, ptr addrspace(1) %b
1239  %r.val = fcmp ueq half %a.val, %b.val
1240  %r.val.sext = sext i1 %r.val to i32
1241  store i32 %r.val.sext, ptr addrspace(1) %r
1242  ret void
1243}
1244
; Unordered-or-less-or-equal compare (fcmp ule) of two half values; the kernel
; is named after the "ngt" (not greater-than) compare the checks expect.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_ngt_f32; the other three prefixes
; expect a native v_cmp_ngt_f16. The 0/-1 value is produced by v_cndmask_b32.
1245define amdgpu_kernel void @fcmp_f16_ngt(
1246; SI-LABEL: fcmp_f16_ngt:
1247; SI:       ; %bb.0: ; %entry
1248; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1249; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1250; SI-NEXT:    s_mov_b32 s11, 0xf000
1251; SI-NEXT:    s_mov_b32 s10, -1
1252; SI-NEXT:    s_mov_b32 s14, s10
1253; SI-NEXT:    s_mov_b32 s15, s11
1254; SI-NEXT:    s_mov_b32 s6, s10
1255; SI-NEXT:    s_mov_b32 s7, s11
1256; SI-NEXT:    s_waitcnt lgkmcnt(0)
1257; SI-NEXT:    s_mov_b32 s12, s2
1258; SI-NEXT:    s_mov_b32 s13, s3
1259; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1260; SI-NEXT:    s_waitcnt vmcnt(0)
1261; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
1262; SI-NEXT:    s_waitcnt vmcnt(0)
1263; SI-NEXT:    s_mov_b32 s8, s0
1264; SI-NEXT:    s_mov_b32 s9, s1
1265; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1266; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1267; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
1268; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1269; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1270; SI-NEXT:    s_endpgm
1271;
1272; VI-LABEL: fcmp_f16_ngt:
1273; VI:       ; %bb.0: ; %entry
1274; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1275; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1276; VI-NEXT:    s_mov_b32 s7, 0xf000
1277; VI-NEXT:    s_mov_b32 s6, -1
1278; VI-NEXT:    s_mov_b32 s14, s6
1279; VI-NEXT:    s_waitcnt lgkmcnt(0)
1280; VI-NEXT:    s_mov_b32 s12, s2
1281; VI-NEXT:    s_mov_b32 s13, s3
1282; VI-NEXT:    s_mov_b32 s15, s7
1283; VI-NEXT:    s_mov_b32 s10, s6
1284; VI-NEXT:    s_mov_b32 s11, s7
1285; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1286; VI-NEXT:    s_waitcnt vmcnt(0)
1287; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
1288; VI-NEXT:    s_waitcnt vmcnt(0)
1289; VI-NEXT:    s_mov_b32 s4, s0
1290; VI-NEXT:    s_mov_b32 s5, s1
1291; VI-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
1292; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1293; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1294; VI-NEXT:    s_endpgm
1295;
1296; GFX11-LABEL: fcmp_f16_ngt:
1297; GFX11:       ; %bb.0: ; %entry
1298; GFX11-NEXT:    s_clause 0x1
1299; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1300; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1301; GFX11-NEXT:    s_mov_b32 s10, -1
1302; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1303; GFX11-NEXT:    s_mov_b32 s14, s10
1304; GFX11-NEXT:    s_mov_b32 s15, s11
1305; GFX11-NEXT:    s_mov_b32 s6, s10
1306; GFX11-NEXT:    s_mov_b32 s7, s11
1307; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1308; GFX11-NEXT:    s_mov_b32 s12, s2
1309; GFX11-NEXT:    s_mov_b32 s13, s3
1310; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1311; GFX11-NEXT:    s_waitcnt vmcnt(0)
1312; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1313; GFX11-NEXT:    s_waitcnt vmcnt(0)
1314; GFX11-NEXT:    s_mov_b32 s8, s0
1315; GFX11-NEXT:    s_mov_b32 s9, s1
1316; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
1317; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1318; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1319; GFX11-NEXT:    s_endpgm
1320;
1321; GFX12-LABEL: fcmp_f16_ngt:
1322; GFX12:       ; %bb.0: ; %entry
1323; GFX12-NEXT:    s_clause 0x1
1324; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1325; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1326; GFX12-NEXT:    s_mov_b32 s10, -1
1327; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1328; GFX12-NEXT:    s_mov_b32 s14, s10
1329; GFX12-NEXT:    s_mov_b32 s15, s11
1330; GFX12-NEXT:    s_mov_b32 s6, s10
1331; GFX12-NEXT:    s_mov_b32 s7, s11
1332; GFX12-NEXT:    s_wait_kmcnt 0x0
1333; GFX12-NEXT:    s_mov_b32 s12, s2
1334; GFX12-NEXT:    s_mov_b32 s13, s3
1335; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1336; GFX12-NEXT:    s_wait_loadcnt 0x0
1337; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1338; GFX12-NEXT:    s_wait_loadcnt 0x0
1339; GFX12-NEXT:    s_mov_b32 s8, s0
1340; GFX12-NEXT:    s_mov_b32 s9, s1
1341; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
1342; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1343; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1344; GFX12-NEXT:    s_endpgm
1345    ptr addrspace(1) %r,
1346    ptr addrspace(1) %a,
1347    ptr addrspace(1) %b) {
1348entry:
1349  %a.val = load volatile half, ptr addrspace(1) %a
1350  %b.val = load volatile half, ptr addrspace(1) %b
1351  %r.val = fcmp ule half %a.val, %b.val
1352  %r.val.sext = sext i1 %r.val to i32
1353  store i32 %r.val.sext, ptr addrspace(1) %r
1354  ret void
1355}
1356
; Unordered-or-greater-than compare (fcmp ugt) of two half values; the kernel
; is named after the "nle" (not less-or-equal) compare the checks expect.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_nle_f32; the other three prefixes
; expect a native v_cmp_nle_f16. The 0/-1 value is produced by v_cndmask_b32.
1357define amdgpu_kernel void @fcmp_f16_nle(
1358; SI-LABEL: fcmp_f16_nle:
1359; SI:       ; %bb.0: ; %entry
1360; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1361; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1362; SI-NEXT:    s_mov_b32 s11, 0xf000
1363; SI-NEXT:    s_mov_b32 s10, -1
1364; SI-NEXT:    s_mov_b32 s14, s10
1365; SI-NEXT:    s_mov_b32 s15, s11
1366; SI-NEXT:    s_mov_b32 s6, s10
1367; SI-NEXT:    s_mov_b32 s7, s11
1368; SI-NEXT:    s_waitcnt lgkmcnt(0)
1369; SI-NEXT:    s_mov_b32 s12, s2
1370; SI-NEXT:    s_mov_b32 s13, s3
1371; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1372; SI-NEXT:    s_waitcnt vmcnt(0)
1373; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
1374; SI-NEXT:    s_waitcnt vmcnt(0)
1375; SI-NEXT:    s_mov_b32 s8, s0
1376; SI-NEXT:    s_mov_b32 s9, s1
1377; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1378; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1379; SI-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
1380; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1381; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1382; SI-NEXT:    s_endpgm
1383;
1384; VI-LABEL: fcmp_f16_nle:
1385; VI:       ; %bb.0: ; %entry
1386; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1387; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1388; VI-NEXT:    s_mov_b32 s7, 0xf000
1389; VI-NEXT:    s_mov_b32 s6, -1
1390; VI-NEXT:    s_mov_b32 s14, s6
1391; VI-NEXT:    s_waitcnt lgkmcnt(0)
1392; VI-NEXT:    s_mov_b32 s12, s2
1393; VI-NEXT:    s_mov_b32 s13, s3
1394; VI-NEXT:    s_mov_b32 s15, s7
1395; VI-NEXT:    s_mov_b32 s10, s6
1396; VI-NEXT:    s_mov_b32 s11, s7
1397; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1398; VI-NEXT:    s_waitcnt vmcnt(0)
1399; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
1400; VI-NEXT:    s_waitcnt vmcnt(0)
1401; VI-NEXT:    s_mov_b32 s4, s0
1402; VI-NEXT:    s_mov_b32 s5, s1
1403; VI-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
1404; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1405; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1406; VI-NEXT:    s_endpgm
1407;
1408; GFX11-LABEL: fcmp_f16_nle:
1409; GFX11:       ; %bb.0: ; %entry
1410; GFX11-NEXT:    s_clause 0x1
1411; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1412; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1413; GFX11-NEXT:    s_mov_b32 s10, -1
1414; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1415; GFX11-NEXT:    s_mov_b32 s14, s10
1416; GFX11-NEXT:    s_mov_b32 s15, s11
1417; GFX11-NEXT:    s_mov_b32 s6, s10
1418; GFX11-NEXT:    s_mov_b32 s7, s11
1419; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1420; GFX11-NEXT:    s_mov_b32 s12, s2
1421; GFX11-NEXT:    s_mov_b32 s13, s3
1422; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1423; GFX11-NEXT:    s_waitcnt vmcnt(0)
1424; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1425; GFX11-NEXT:    s_waitcnt vmcnt(0)
1426; GFX11-NEXT:    s_mov_b32 s8, s0
1427; GFX11-NEXT:    s_mov_b32 s9, s1
1428; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
1429; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1430; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1431; GFX11-NEXT:    s_endpgm
1432;
1433; GFX12-LABEL: fcmp_f16_nle:
1434; GFX12:       ; %bb.0: ; %entry
1435; GFX12-NEXT:    s_clause 0x1
1436; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1437; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1438; GFX12-NEXT:    s_mov_b32 s10, -1
1439; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1440; GFX12-NEXT:    s_mov_b32 s14, s10
1441; GFX12-NEXT:    s_mov_b32 s15, s11
1442; GFX12-NEXT:    s_mov_b32 s6, s10
1443; GFX12-NEXT:    s_mov_b32 s7, s11
1444; GFX12-NEXT:    s_wait_kmcnt 0x0
1445; GFX12-NEXT:    s_mov_b32 s12, s2
1446; GFX12-NEXT:    s_mov_b32 s13, s3
1447; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1448; GFX12-NEXT:    s_wait_loadcnt 0x0
1449; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1450; GFX12-NEXT:    s_wait_loadcnt 0x0
1451; GFX12-NEXT:    s_mov_b32 s8, s0
1452; GFX12-NEXT:    s_mov_b32 s9, s1
1453; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
1454; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1455; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1456; GFX12-NEXT:    s_endpgm
1457    ptr addrspace(1) %r,
1458    ptr addrspace(1) %a,
1459    ptr addrspace(1) %b) {
1460entry:
1461  %a.val = load volatile half, ptr addrspace(1) %a
1462  %b.val = load volatile half, ptr addrspace(1) %b
1463  %r.val = fcmp ugt half %a.val, %b.val
1464  %r.val.sext = sext i1 %r.val to i32
1465  store i32 %r.val.sext, ptr addrspace(1) %r
1466  ret void
1467}
1468
; Unordered-or-not-equal compare (fcmp une) of two half values; the kernel is
; named after the "neq" (not equal) compare the checks expect.
; The kernel loads %a and %b with volatile loads, compares them, sign-extends
; the i1 result to i32 (0 or -1) and stores it to %r.
; Expected selection: under the SI prefix both operands are first widened with
; v_cvt_f32_f16 and compared with v_cmp_neq_f32; the other three prefixes
; expect a native v_cmp_neq_f16. The 0/-1 value is produced by v_cndmask_b32.
1469define amdgpu_kernel void @fcmp_f16_neq(
1470; SI-LABEL: fcmp_f16_neq:
1471; SI:       ; %bb.0: ; %entry
1472; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1473; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1474; SI-NEXT:    s_mov_b32 s11, 0xf000
1475; SI-NEXT:    s_mov_b32 s10, -1
1476; SI-NEXT:    s_mov_b32 s14, s10
1477; SI-NEXT:    s_mov_b32 s15, s11
1478; SI-NEXT:    s_mov_b32 s6, s10
1479; SI-NEXT:    s_mov_b32 s7, s11
1480; SI-NEXT:    s_waitcnt lgkmcnt(0)
1481; SI-NEXT:    s_mov_b32 s12, s2
1482; SI-NEXT:    s_mov_b32 s13, s3
1483; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1484; SI-NEXT:    s_waitcnt vmcnt(0)
1485; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
1486; SI-NEXT:    s_waitcnt vmcnt(0)
1487; SI-NEXT:    s_mov_b32 s8, s0
1488; SI-NEXT:    s_mov_b32 s9, s1
1489; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1490; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1491; SI-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
1492; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1493; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1494; SI-NEXT:    s_endpgm
1495;
1496; VI-LABEL: fcmp_f16_neq:
1497; VI:       ; %bb.0: ; %entry
1498; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1499; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1500; VI-NEXT:    s_mov_b32 s7, 0xf000
1501; VI-NEXT:    s_mov_b32 s6, -1
1502; VI-NEXT:    s_mov_b32 s14, s6
1503; VI-NEXT:    s_waitcnt lgkmcnt(0)
1504; VI-NEXT:    s_mov_b32 s12, s2
1505; VI-NEXT:    s_mov_b32 s13, s3
1506; VI-NEXT:    s_mov_b32 s15, s7
1507; VI-NEXT:    s_mov_b32 s10, s6
1508; VI-NEXT:    s_mov_b32 s11, s7
1509; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1510; VI-NEXT:    s_waitcnt vmcnt(0)
1511; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
1512; VI-NEXT:    s_waitcnt vmcnt(0)
1513; VI-NEXT:    s_mov_b32 s4, s0
1514; VI-NEXT:    s_mov_b32 s5, s1
1515; VI-NEXT:    v_cmp_neq_f16_e32 vcc, v0, v1
1516; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1517; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1518; VI-NEXT:    s_endpgm
1519;
1520; GFX11-LABEL: fcmp_f16_neq:
1521; GFX11:       ; %bb.0: ; %entry
1522; GFX11-NEXT:    s_clause 0x1
1523; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1524; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1525; GFX11-NEXT:    s_mov_b32 s10, -1
1526; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1527; GFX11-NEXT:    s_mov_b32 s14, s10
1528; GFX11-NEXT:    s_mov_b32 s15, s11
1529; GFX11-NEXT:    s_mov_b32 s6, s10
1530; GFX11-NEXT:    s_mov_b32 s7, s11
1531; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1532; GFX11-NEXT:    s_mov_b32 s12, s2
1533; GFX11-NEXT:    s_mov_b32 s13, s3
1534; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1535; GFX11-NEXT:    s_waitcnt vmcnt(0)
1536; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1537; GFX11-NEXT:    s_waitcnt vmcnt(0)
1538; GFX11-NEXT:    s_mov_b32 s8, s0
1539; GFX11-NEXT:    s_mov_b32 s9, s1
1540; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
1541; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1542; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1543; GFX11-NEXT:    s_endpgm
1544;
1545; GFX12-LABEL: fcmp_f16_neq:
1546; GFX12:       ; %bb.0: ; %entry
1547; GFX12-NEXT:    s_clause 0x1
1548; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1549; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1550; GFX12-NEXT:    s_mov_b32 s10, -1
1551; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1552; GFX12-NEXT:    s_mov_b32 s14, s10
1553; GFX12-NEXT:    s_mov_b32 s15, s11
1554; GFX12-NEXT:    s_mov_b32 s6, s10
1555; GFX12-NEXT:    s_mov_b32 s7, s11
1556; GFX12-NEXT:    s_wait_kmcnt 0x0
1557; GFX12-NEXT:    s_mov_b32 s12, s2
1558; GFX12-NEXT:    s_mov_b32 s13, s3
1559; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1560; GFX12-NEXT:    s_wait_loadcnt 0x0
1561; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1562; GFX12-NEXT:    s_wait_loadcnt 0x0
1563; GFX12-NEXT:    s_mov_b32 s8, s0
1564; GFX12-NEXT:    s_mov_b32 s9, s1
1565; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
1566; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1567; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1568; GFX12-NEXT:    s_endpgm
1569    ptr addrspace(1) %r,
1570    ptr addrspace(1) %a,
1571    ptr addrspace(1) %b) {
1572entry:
1573  %a.val = load volatile half, ptr addrspace(1) %a
1574  %b.val = load volatile half, ptr addrspace(1) %b
1575  %r.val = fcmp une half %a.val, %b.val
1576  %r.val.sext = sext i1 %r.val to i32
1577  store i32 %r.val.sext, ptr addrspace(1) %r
1578  ret void
1579}
1580
; Scalar half "not less than" test: fcmp uge on two volatile half loads, with
; the i1 result sign-extended to i32 and stored through %r.
; The SI checks expect both operands to be widened with v_cvt_f32_f16 and then
; compared with v_cmp_nlt_f32; the VI, GFX11 and GFX12 checks expect a direct
; v_cmp_nlt_f16. In every run the two loads are expected one at a time, each
; followed by its own wait.
1581define amdgpu_kernel void @fcmp_f16_nlt(
1582; SI-LABEL: fcmp_f16_nlt:
1583; SI:       ; %bb.0: ; %entry
1584; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1585; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1586; SI-NEXT:    s_mov_b32 s11, 0xf000
1587; SI-NEXT:    s_mov_b32 s10, -1
1588; SI-NEXT:    s_mov_b32 s14, s10
1589; SI-NEXT:    s_mov_b32 s15, s11
1590; SI-NEXT:    s_mov_b32 s6, s10
1591; SI-NEXT:    s_mov_b32 s7, s11
1592; SI-NEXT:    s_waitcnt lgkmcnt(0)
1593; SI-NEXT:    s_mov_b32 s12, s2
1594; SI-NEXT:    s_mov_b32 s13, s3
1595; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1596; SI-NEXT:    s_waitcnt vmcnt(0)
1597; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
1598; SI-NEXT:    s_waitcnt vmcnt(0)
1599; SI-NEXT:    s_mov_b32 s8, s0
1600; SI-NEXT:    s_mov_b32 s9, s1
1601; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1602; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1603; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
1604; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1605; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1606; SI-NEXT:    s_endpgm
1607;
1608; VI-LABEL: fcmp_f16_nlt:
1609; VI:       ; %bb.0: ; %entry
1610; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1611; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1612; VI-NEXT:    s_mov_b32 s7, 0xf000
1613; VI-NEXT:    s_mov_b32 s6, -1
1614; VI-NEXT:    s_mov_b32 s14, s6
1615; VI-NEXT:    s_waitcnt lgkmcnt(0)
1616; VI-NEXT:    s_mov_b32 s12, s2
1617; VI-NEXT:    s_mov_b32 s13, s3
1618; VI-NEXT:    s_mov_b32 s15, s7
1619; VI-NEXT:    s_mov_b32 s10, s6
1620; VI-NEXT:    s_mov_b32 s11, s7
1621; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
1622; VI-NEXT:    s_waitcnt vmcnt(0)
1623; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
1624; VI-NEXT:    s_waitcnt vmcnt(0)
1625; VI-NEXT:    s_mov_b32 s4, s0
1626; VI-NEXT:    s_mov_b32 s5, s1
1627; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
1628; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1629; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1630; VI-NEXT:    s_endpgm
1631;
1632; GFX11-LABEL: fcmp_f16_nlt:
1633; GFX11:       ; %bb.0: ; %entry
1634; GFX11-NEXT:    s_clause 0x1
1635; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1636; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1637; GFX11-NEXT:    s_mov_b32 s10, -1
1638; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1639; GFX11-NEXT:    s_mov_b32 s14, s10
1640; GFX11-NEXT:    s_mov_b32 s15, s11
1641; GFX11-NEXT:    s_mov_b32 s6, s10
1642; GFX11-NEXT:    s_mov_b32 s7, s11
1643; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1644; GFX11-NEXT:    s_mov_b32 s12, s2
1645; GFX11-NEXT:    s_mov_b32 s13, s3
1646; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
1647; GFX11-NEXT:    s_waitcnt vmcnt(0)
1648; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
1649; GFX11-NEXT:    s_waitcnt vmcnt(0)
1650; GFX11-NEXT:    s_mov_b32 s8, s0
1651; GFX11-NEXT:    s_mov_b32 s9, s1
1652; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
1653; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1654; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
1655; GFX11-NEXT:    s_endpgm
1656;
1657; GFX12-LABEL: fcmp_f16_nlt:
1658; GFX12:       ; %bb.0: ; %entry
1659; GFX12-NEXT:    s_clause 0x1
1660; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1661; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1662; GFX12-NEXT:    s_mov_b32 s10, -1
1663; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1664; GFX12-NEXT:    s_mov_b32 s14, s10
1665; GFX12-NEXT:    s_mov_b32 s15, s11
1666; GFX12-NEXT:    s_mov_b32 s6, s10
1667; GFX12-NEXT:    s_mov_b32 s7, s11
1668; GFX12-NEXT:    s_wait_kmcnt 0x0
1669; GFX12-NEXT:    s_mov_b32 s12, s2
1670; GFX12-NEXT:    s_mov_b32 s13, s3
1671; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
1672; GFX12-NEXT:    s_wait_loadcnt 0x0
1673; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
1674; GFX12-NEXT:    s_wait_loadcnt 0x0
1675; GFX12-NEXT:    s_mov_b32 s8, s0
1676; GFX12-NEXT:    s_mov_b32 s9, s1
1677; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
1678; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1679; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
1680; GFX12-NEXT:    s_endpgm
1681    ptr addrspace(1) %r,
1682    ptr addrspace(1) %a,
1683    ptr addrspace(1) %b) {
1684entry:
1685  %a.val = load volatile half, ptr addrspace(1) %a
1686  %b.val = load volatile half, ptr addrspace(1) %b
  ; uge is true when the operands are unordered or %a.val >= %b.val.
1687  %r.val = fcmp uge half %a.val, %b.val
  ; sext turns the i1 into -1 (true) or 0 (false), which is what the
  ; v_cndmask_b32 "0, -1" select in the checks produces.
1688  %r.val.sext = sext i1 %r.val to i32
1689  store i32 %r.val.sext, ptr addrspace(1) %r
1690  ret void
1691}
1692
; Vector "less than" test: fcmp olt on two <2 x half> loads, with the <2 x i1>
; result sign-extended to <2 x i32> and stored through %r.
; The SI checks expect each half to be converted with v_cvt_f32_f16 (the high
; half after a 16-bit right shift) and two v_cmp_lt_f32 compares; the VI, GFX11
; and GFX12 checks expect the high halves to be shifted down and two
; v_cmp_lt_f16 compares. Both results are written with one 64-bit store.
1693define amdgpu_kernel void @fcmp_v2f16_lt(
1694; SI-LABEL: fcmp_v2f16_lt:
1695; SI:       ; %bb.0: ; %entry
1696; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1697; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1698; SI-NEXT:    s_mov_b32 s11, 0xf000
1699; SI-NEXT:    s_mov_b32 s10, -1
1700; SI-NEXT:    s_mov_b32 s14, s10
1701; SI-NEXT:    s_mov_b32 s15, s11
1702; SI-NEXT:    s_waitcnt lgkmcnt(0)
1703; SI-NEXT:    s_mov_b32 s12, s2
1704; SI-NEXT:    s_mov_b32 s13, s3
1705; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1706; SI-NEXT:    s_mov_b32 s6, s10
1707; SI-NEXT:    s_mov_b32 s7, s11
1708; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
1709; SI-NEXT:    s_mov_b32 s8, s0
1710; SI-NEXT:    s_mov_b32 s9, s1
1711; SI-NEXT:    s_waitcnt vmcnt(1)
1712; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1713; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1714; SI-NEXT:    s_waitcnt vmcnt(0)
1715; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1716; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1717; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1718; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1719; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
1720; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1721; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v1
1722; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
1723; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1724; SI-NEXT:    s_endpgm
1725;
1726; VI-LABEL: fcmp_v2f16_lt:
1727; VI:       ; %bb.0: ; %entry
1728; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1729; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1730; VI-NEXT:    s_mov_b32 s7, 0xf000
1731; VI-NEXT:    s_mov_b32 s6, -1
1732; VI-NEXT:    s_mov_b32 s10, s6
1733; VI-NEXT:    s_mov_b32 s11, s7
1734; VI-NEXT:    s_waitcnt lgkmcnt(0)
1735; VI-NEXT:    s_mov_b32 s12, s2
1736; VI-NEXT:    s_mov_b32 s13, s3
1737; VI-NEXT:    s_mov_b32 s14, s6
1738; VI-NEXT:    s_mov_b32 s15, s7
1739; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1740; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
1741; VI-NEXT:    s_mov_b32 s4, s0
1742; VI-NEXT:    s_mov_b32 s5, s1
1743; VI-NEXT:    s_waitcnt vmcnt(1)
1744; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1745; VI-NEXT:    s_waitcnt vmcnt(0)
1746; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1747; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v1, v0
1748; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1749; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v3, v2
1750; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
1751; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1752; VI-NEXT:    s_endpgm
1753;
1754; GFX11-LABEL: fcmp_v2f16_lt:
1755; GFX11:       ; %bb.0: ; %entry
1756; GFX11-NEXT:    s_clause 0x1
1757; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1758; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1759; GFX11-NEXT:    s_mov_b32 s10, -1
1760; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1761; GFX11-NEXT:    s_mov_b32 s6, s10
1762; GFX11-NEXT:    s_mov_b32 s7, s11
1763; GFX11-NEXT:    s_mov_b32 s14, s10
1764; GFX11-NEXT:    s_mov_b32 s15, s11
1765; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX11-NEXT:    s_mov_b32 s12, s2
1767; GFX11-NEXT:    s_mov_b32 s13, s3
1768; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
1769; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
1770; GFX11-NEXT:    s_mov_b32 s8, s0
1771; GFX11-NEXT:    s_mov_b32 s9, s1
1772; GFX11-NEXT:    s_waitcnt vmcnt(1)
1773; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1774; GFX11-NEXT:    s_waitcnt vmcnt(0)
1775; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1776; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
1777; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1778; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1779; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v2
1780; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1781; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
1782; GFX11-NEXT:    s_endpgm
1783;
1784; GFX12-LABEL: fcmp_v2f16_lt:
1785; GFX12:       ; %bb.0: ; %entry
1786; GFX12-NEXT:    s_clause 0x1
1787; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1788; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1789; GFX12-NEXT:    s_mov_b32 s10, -1
1790; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1791; GFX12-NEXT:    s_mov_b32 s6, s10
1792; GFX12-NEXT:    s_mov_b32 s7, s11
1793; GFX12-NEXT:    s_mov_b32 s14, s10
1794; GFX12-NEXT:    s_mov_b32 s15, s11
1795; GFX12-NEXT:    s_wait_kmcnt 0x0
1796; GFX12-NEXT:    s_mov_b32 s12, s2
1797; GFX12-NEXT:    s_mov_b32 s13, s3
1798; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
1799; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
1800; GFX12-NEXT:    s_mov_b32 s8, s0
1801; GFX12-NEXT:    s_mov_b32 s9, s1
1802; GFX12-NEXT:    s_wait_loadcnt 0x1
1803; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1804; GFX12-NEXT:    s_wait_loadcnt 0x0
1805; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1806; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
1807; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1808; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1809; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v2
1810; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1811; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
1812; GFX12-NEXT:    s_endpgm
1813    ptr addrspace(1) %r,
1814    ptr addrspace(1) %a,
1815    ptr addrspace(1) %b) {
1816entry:
1817  %a.val = load <2 x half>, ptr addrspace(1) %a
1818  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; olt is applied per element: true only when the pair is ordered and a < b.
1819  %r.val = fcmp olt <2 x half> %a.val, %b.val
  ; Each i1 element becomes -1 (true) or 0 (false) in the stored <2 x i32>.
1820  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1821  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
1822  ret void
1823}
1824
1825
; Vector "equal" test: fcmp oeq on two <2 x half> loads, with the <2 x i1>
; result sign-extended to <2 x i32> and stored through %r.
; The SI checks expect each half to be converted with v_cvt_f32_f16 (the high
; half after a 16-bit right shift) and two v_cmp_eq_f32 compares; the VI, GFX11
; and GFX12 checks expect the high halves to be shifted down and two
; v_cmp_eq_f16 compares. Both results are written with one 64-bit store.
1826define amdgpu_kernel void @fcmp_v2f16_eq(
1827; SI-LABEL: fcmp_v2f16_eq:
1828; SI:       ; %bb.0: ; %entry
1829; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1830; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1831; SI-NEXT:    s_mov_b32 s11, 0xf000
1832; SI-NEXT:    s_mov_b32 s10, -1
1833; SI-NEXT:    s_mov_b32 s14, s10
1834; SI-NEXT:    s_mov_b32 s15, s11
1835; SI-NEXT:    s_waitcnt lgkmcnt(0)
1836; SI-NEXT:    s_mov_b32 s12, s2
1837; SI-NEXT:    s_mov_b32 s13, s3
1838; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1839; SI-NEXT:    s_mov_b32 s6, s10
1840; SI-NEXT:    s_mov_b32 s7, s11
1841; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
1842; SI-NEXT:    s_mov_b32 s8, s0
1843; SI-NEXT:    s_mov_b32 s9, s1
1844; SI-NEXT:    s_waitcnt vmcnt(1)
1845; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1846; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1847; SI-NEXT:    s_waitcnt vmcnt(0)
1848; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1849; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1850; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1851; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1852; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v3
1853; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1854; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v1
1855; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
1856; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1857; SI-NEXT:    s_endpgm
1858;
1859; VI-LABEL: fcmp_v2f16_eq:
1860; VI:       ; %bb.0: ; %entry
1861; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1862; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1863; VI-NEXT:    s_mov_b32 s7, 0xf000
1864; VI-NEXT:    s_mov_b32 s6, -1
1865; VI-NEXT:    s_mov_b32 s10, s6
1866; VI-NEXT:    s_mov_b32 s11, s7
1867; VI-NEXT:    s_waitcnt lgkmcnt(0)
1868; VI-NEXT:    s_mov_b32 s12, s2
1869; VI-NEXT:    s_mov_b32 s13, s3
1870; VI-NEXT:    s_mov_b32 s14, s6
1871; VI-NEXT:    s_mov_b32 s15, s7
1872; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1873; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
1874; VI-NEXT:    s_mov_b32 s4, s0
1875; VI-NEXT:    s_mov_b32 s5, s1
1876; VI-NEXT:    s_waitcnt vmcnt(1)
1877; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1878; VI-NEXT:    s_waitcnt vmcnt(0)
1879; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1880; VI-NEXT:    v_cmp_eq_f16_e32 vcc, v1, v0
1881; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1882; VI-NEXT:    v_cmp_eq_f16_e32 vcc, v3, v2
1883; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
1884; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1885; VI-NEXT:    s_endpgm
1886;
1887; GFX11-LABEL: fcmp_v2f16_eq:
1888; GFX11:       ; %bb.0: ; %entry
1889; GFX11-NEXT:    s_clause 0x1
1890; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1891; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1892; GFX11-NEXT:    s_mov_b32 s10, -1
1893; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1894; GFX11-NEXT:    s_mov_b32 s6, s10
1895; GFX11-NEXT:    s_mov_b32 s7, s11
1896; GFX11-NEXT:    s_mov_b32 s14, s10
1897; GFX11-NEXT:    s_mov_b32 s15, s11
1898; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1899; GFX11-NEXT:    s_mov_b32 s12, s2
1900; GFX11-NEXT:    s_mov_b32 s13, s3
1901; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
1902; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
1903; GFX11-NEXT:    s_mov_b32 s8, s0
1904; GFX11-NEXT:    s_mov_b32 s9, s1
1905; GFX11-NEXT:    s_waitcnt vmcnt(1)
1906; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1907; GFX11-NEXT:    s_waitcnt vmcnt(0)
1908; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1909; GFX11-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v1, v0
1910; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1911; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1912; GFX11-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v3, v2
1913; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1914; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
1915; GFX11-NEXT:    s_endpgm
1916;
1917; GFX12-LABEL: fcmp_v2f16_eq:
1918; GFX12:       ; %bb.0: ; %entry
1919; GFX12-NEXT:    s_clause 0x1
1920; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1921; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1922; GFX12-NEXT:    s_mov_b32 s10, -1
1923; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
1924; GFX12-NEXT:    s_mov_b32 s6, s10
1925; GFX12-NEXT:    s_mov_b32 s7, s11
1926; GFX12-NEXT:    s_mov_b32 s14, s10
1927; GFX12-NEXT:    s_mov_b32 s15, s11
1928; GFX12-NEXT:    s_wait_kmcnt 0x0
1929; GFX12-NEXT:    s_mov_b32 s12, s2
1930; GFX12-NEXT:    s_mov_b32 s13, s3
1931; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
1932; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
1933; GFX12-NEXT:    s_mov_b32 s8, s0
1934; GFX12-NEXT:    s_mov_b32 s9, s1
1935; GFX12-NEXT:    s_wait_loadcnt 0x1
1936; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1937; GFX12-NEXT:    s_wait_loadcnt 0x0
1938; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1939; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v1, v0
1940; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1941; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1942; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v3, v2
1943; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1944; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
1945; GFX12-NEXT:    s_endpgm
1946    ptr addrspace(1) %r,
1947    ptr addrspace(1) %a,
1948    ptr addrspace(1) %b) {
1949entry:
1950  %a.val = load <2 x half>, ptr addrspace(1) %a
1951  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; oeq is applied per element: true only when the pair is ordered and equal.
1952  %r.val = fcmp oeq <2 x half> %a.val, %b.val
  ; Each i1 element becomes -1 (true) or 0 (false) in the stored <2 x i32>.
1953  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
1954  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
1955  ret void
1956}
1957
; Vector "less than or equal" test: fcmp ole on two <2 x half> loads, with the
; <2 x i1> result sign-extended to <2 x i32> and stored through %r.
; The SI checks expect each half to be converted with v_cvt_f32_f16 (the high
; half after a 16-bit right shift) and two v_cmp_le_f32 compares; the VI, GFX11
; and GFX12 checks expect the high halves to be shifted down and two
; v_cmp_le_f16 compares. Both results are written with one 64-bit store.
1958define amdgpu_kernel void @fcmp_v2f16_le(
1959; SI-LABEL: fcmp_v2f16_le:
1960; SI:       ; %bb.0: ; %entry
1961; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1962; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1963; SI-NEXT:    s_mov_b32 s11, 0xf000
1964; SI-NEXT:    s_mov_b32 s10, -1
1965; SI-NEXT:    s_mov_b32 s14, s10
1966; SI-NEXT:    s_mov_b32 s15, s11
1967; SI-NEXT:    s_waitcnt lgkmcnt(0)
1968; SI-NEXT:    s_mov_b32 s12, s2
1969; SI-NEXT:    s_mov_b32 s13, s3
1970; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
1971; SI-NEXT:    s_mov_b32 s6, s10
1972; SI-NEXT:    s_mov_b32 s7, s11
1973; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
1974; SI-NEXT:    s_mov_b32 s8, s0
1975; SI-NEXT:    s_mov_b32 s9, s1
1976; SI-NEXT:    s_waitcnt vmcnt(1)
1977; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1978; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1979; SI-NEXT:    s_waitcnt vmcnt(0)
1980; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1981; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1982; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1983; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1984; SI-NEXT:    v_cmp_le_f32_e32 vcc, v2, v3
1985; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
1986; SI-NEXT:    v_cmp_le_f32_e32 vcc, v4, v1
1987; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
1988; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1989; SI-NEXT:    s_endpgm
1990;
1991; VI-LABEL: fcmp_v2f16_le:
1992; VI:       ; %bb.0: ; %entry
1993; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1994; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1995; VI-NEXT:    s_mov_b32 s7, 0xf000
1996; VI-NEXT:    s_mov_b32 s6, -1
1997; VI-NEXT:    s_mov_b32 s10, s6
1998; VI-NEXT:    s_mov_b32 s11, s7
1999; VI-NEXT:    s_waitcnt lgkmcnt(0)
2000; VI-NEXT:    s_mov_b32 s12, s2
2001; VI-NEXT:    s_mov_b32 s13, s3
2002; VI-NEXT:    s_mov_b32 s14, s6
2003; VI-NEXT:    s_mov_b32 s15, s7
2004; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2005; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2006; VI-NEXT:    s_mov_b32 s4, s0
2007; VI-NEXT:    s_mov_b32 s5, s1
2008; VI-NEXT:    s_waitcnt vmcnt(1)
2009; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2010; VI-NEXT:    s_waitcnt vmcnt(0)
2011; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2012; VI-NEXT:    v_cmp_le_f16_e32 vcc, v1, v0
2013; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2014; VI-NEXT:    v_cmp_le_f16_e32 vcc, v3, v2
2015; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2016; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2017; VI-NEXT:    s_endpgm
2018;
2019; GFX11-LABEL: fcmp_v2f16_le:
2020; GFX11:       ; %bb.0: ; %entry
2021; GFX11-NEXT:    s_clause 0x1
2022; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2023; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2024; GFX11-NEXT:    s_mov_b32 s10, -1
2025; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2026; GFX11-NEXT:    s_mov_b32 s6, s10
2027; GFX11-NEXT:    s_mov_b32 s7, s11
2028; GFX11-NEXT:    s_mov_b32 s14, s10
2029; GFX11-NEXT:    s_mov_b32 s15, s11
2030; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2031; GFX11-NEXT:    s_mov_b32 s12, s2
2032; GFX11-NEXT:    s_mov_b32 s13, s3
2033; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2034; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2035; GFX11-NEXT:    s_mov_b32 s8, s0
2036; GFX11-NEXT:    s_mov_b32 s9, s1
2037; GFX11-NEXT:    s_waitcnt vmcnt(1)
2038; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2039; GFX11-NEXT:    s_waitcnt vmcnt(0)
2040; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2041; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1, v0
2042; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2043; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2044; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v2
2045; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2046; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2047; GFX11-NEXT:    s_endpgm
2048;
2049; GFX12-LABEL: fcmp_v2f16_le:
2050; GFX12:       ; %bb.0: ; %entry
2051; GFX12-NEXT:    s_clause 0x1
2052; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2053; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2054; GFX12-NEXT:    s_mov_b32 s10, -1
2055; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2056; GFX12-NEXT:    s_mov_b32 s6, s10
2057; GFX12-NEXT:    s_mov_b32 s7, s11
2058; GFX12-NEXT:    s_mov_b32 s14, s10
2059; GFX12-NEXT:    s_mov_b32 s15, s11
2060; GFX12-NEXT:    s_wait_kmcnt 0x0
2061; GFX12-NEXT:    s_mov_b32 s12, s2
2062; GFX12-NEXT:    s_mov_b32 s13, s3
2063; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2064; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2065; GFX12-NEXT:    s_mov_b32 s8, s0
2066; GFX12-NEXT:    s_mov_b32 s9, s1
2067; GFX12-NEXT:    s_wait_loadcnt 0x1
2068; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2069; GFX12-NEXT:    s_wait_loadcnt 0x0
2070; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2071; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1, v0
2072; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2073; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2074; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v2
2075; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2076; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2077; GFX12-NEXT:    s_endpgm
2078    ptr addrspace(1) %r,
2079    ptr addrspace(1) %a,
2080    ptr addrspace(1) %b) {
2081entry:
2082  %a.val = load <2 x half>, ptr addrspace(1) %a
2083  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; ole is applied per element: true only when the pair is ordered and a <= b.
2084  %r.val = fcmp ole <2 x half> %a.val, %b.val
  ; Each i1 element becomes -1 (true) or 0 (false) in the stored <2 x i32>.
2085  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2086  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2087  ret void
2088}
2089
; Vector "greater than" test: fcmp ogt on two <2 x half> loads, with the
; <2 x i1> result sign-extended to <2 x i32> and stored through %r.
; The SI checks expect each half to be converted with v_cvt_f32_f16 (the high
; half after a 16-bit right shift) and two v_cmp_gt_f32 compares; the VI, GFX11
; and GFX12 checks expect the high halves to be shifted down and two
; v_cmp_gt_f16 compares. Both results are written with one 64-bit store.
2090define amdgpu_kernel void @fcmp_v2f16_gt(
2091; SI-LABEL: fcmp_v2f16_gt:
2092; SI:       ; %bb.0: ; %entry
2093; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2094; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2095; SI-NEXT:    s_mov_b32 s11, 0xf000
2096; SI-NEXT:    s_mov_b32 s10, -1
2097; SI-NEXT:    s_mov_b32 s14, s10
2098; SI-NEXT:    s_mov_b32 s15, s11
2099; SI-NEXT:    s_waitcnt lgkmcnt(0)
2100; SI-NEXT:    s_mov_b32 s12, s2
2101; SI-NEXT:    s_mov_b32 s13, s3
2102; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2103; SI-NEXT:    s_mov_b32 s6, s10
2104; SI-NEXT:    s_mov_b32 s7, s11
2105; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2106; SI-NEXT:    s_mov_b32 s8, s0
2107; SI-NEXT:    s_mov_b32 s9, s1
2108; SI-NEXT:    s_waitcnt vmcnt(1)
2109; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2110; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2111; SI-NEXT:    s_waitcnt vmcnt(0)
2112; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2113; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2114; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2115; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2116; SI-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v3
2117; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2118; SI-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v1
2119; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2120; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2121; SI-NEXT:    s_endpgm
2122;
2123; VI-LABEL: fcmp_v2f16_gt:
2124; VI:       ; %bb.0: ; %entry
2125; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2126; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2127; VI-NEXT:    s_mov_b32 s7, 0xf000
2128; VI-NEXT:    s_mov_b32 s6, -1
2129; VI-NEXT:    s_mov_b32 s10, s6
2130; VI-NEXT:    s_mov_b32 s11, s7
2131; VI-NEXT:    s_waitcnt lgkmcnt(0)
2132; VI-NEXT:    s_mov_b32 s12, s2
2133; VI-NEXT:    s_mov_b32 s13, s3
2134; VI-NEXT:    s_mov_b32 s14, s6
2135; VI-NEXT:    s_mov_b32 s15, s7
2136; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2137; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2138; VI-NEXT:    s_mov_b32 s4, s0
2139; VI-NEXT:    s_mov_b32 s5, s1
2140; VI-NEXT:    s_waitcnt vmcnt(1)
2141; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2142; VI-NEXT:    s_waitcnt vmcnt(0)
2143; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2144; VI-NEXT:    v_cmp_gt_f16_e32 vcc, v1, v0
2145; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2146; VI-NEXT:    v_cmp_gt_f16_e32 vcc, v3, v2
2147; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2148; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2149; VI-NEXT:    s_endpgm
2150;
2151; GFX11-LABEL: fcmp_v2f16_gt:
2152; GFX11:       ; %bb.0: ; %entry
2153; GFX11-NEXT:    s_clause 0x1
2154; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2155; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2156; GFX11-NEXT:    s_mov_b32 s10, -1
2157; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2158; GFX11-NEXT:    s_mov_b32 s6, s10
2159; GFX11-NEXT:    s_mov_b32 s7, s11
2160; GFX11-NEXT:    s_mov_b32 s14, s10
2161; GFX11-NEXT:    s_mov_b32 s15, s11
2162; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2163; GFX11-NEXT:    s_mov_b32 s12, s2
2164; GFX11-NEXT:    s_mov_b32 s13, s3
2165; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2166; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2167; GFX11-NEXT:    s_mov_b32 s8, s0
2168; GFX11-NEXT:    s_mov_b32 s9, s1
2169; GFX11-NEXT:    s_waitcnt vmcnt(1)
2170; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2171; GFX11-NEXT:    s_waitcnt vmcnt(0)
2172; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2173; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1, v0
2174; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2175; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2176; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v2
2177; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2178; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2179; GFX11-NEXT:    s_endpgm
2180;
2181; GFX12-LABEL: fcmp_v2f16_gt:
2182; GFX12:       ; %bb.0: ; %entry
2183; GFX12-NEXT:    s_clause 0x1
2184; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2185; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2186; GFX12-NEXT:    s_mov_b32 s10, -1
2187; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2188; GFX12-NEXT:    s_mov_b32 s6, s10
2189; GFX12-NEXT:    s_mov_b32 s7, s11
2190; GFX12-NEXT:    s_mov_b32 s14, s10
2191; GFX12-NEXT:    s_mov_b32 s15, s11
2192; GFX12-NEXT:    s_wait_kmcnt 0x0
2193; GFX12-NEXT:    s_mov_b32 s12, s2
2194; GFX12-NEXT:    s_mov_b32 s13, s3
2195; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2196; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2197; GFX12-NEXT:    s_mov_b32 s8, s0
2198; GFX12-NEXT:    s_mov_b32 s9, s1
2199; GFX12-NEXT:    s_wait_loadcnt 0x1
2200; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2201; GFX12-NEXT:    s_wait_loadcnt 0x0
2202; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2203; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1, v0
2204; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2205; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2206; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v2
2207; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2208; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2209; GFX12-NEXT:    s_endpgm
2210    ptr addrspace(1) %r,
2211    ptr addrspace(1) %a,
2212    ptr addrspace(1) %b) {
2213entry:
2214  %a.val = load <2 x half>, ptr addrspace(1) %a
2215  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; ogt is applied per element: true only when the pair is ordered and a > b.
2216  %r.val = fcmp ogt <2 x half> %a.val, %b.val
  ; Each i1 element becomes -1 (true) or 0 (false) in the stored <2 x i32>.
2217  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2218  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2219  ret void
2220}
2221
2222
; Vector "less than or greater than" test: fcmp one on two <2 x half> loads,
; with the <2 x i1> result sign-extended to <2 x i32> and stored through %r.
; The SI checks expect each half to be converted with v_cvt_f32_f16 (the high
; half after a 16-bit right shift) and two v_cmp_lg_f32 compares; the VI, GFX11
; and GFX12 checks expect the high halves to be shifted down and two
; v_cmp_lg_f16 compares. Both results are written with one 64-bit store.
2223define amdgpu_kernel void @fcmp_v2f16_lg(
2224; SI-LABEL: fcmp_v2f16_lg:
2225; SI:       ; %bb.0: ; %entry
2226; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2227; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2228; SI-NEXT:    s_mov_b32 s11, 0xf000
2229; SI-NEXT:    s_mov_b32 s10, -1
2230; SI-NEXT:    s_mov_b32 s14, s10
2231; SI-NEXT:    s_mov_b32 s15, s11
2232; SI-NEXT:    s_waitcnt lgkmcnt(0)
2233; SI-NEXT:    s_mov_b32 s12, s2
2234; SI-NEXT:    s_mov_b32 s13, s3
2235; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2236; SI-NEXT:    s_mov_b32 s6, s10
2237; SI-NEXT:    s_mov_b32 s7, s11
2238; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2239; SI-NEXT:    s_mov_b32 s8, s0
2240; SI-NEXT:    s_mov_b32 s9, s1
2241; SI-NEXT:    s_waitcnt vmcnt(1)
2242; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2243; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2244; SI-NEXT:    s_waitcnt vmcnt(0)
2245; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2246; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2247; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2248; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2249; SI-NEXT:    v_cmp_lg_f32_e32 vcc, v2, v3
2250; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2251; SI-NEXT:    v_cmp_lg_f32_e32 vcc, v4, v1
2252; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2253; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2254; SI-NEXT:    s_endpgm
2255;
2256; VI-LABEL: fcmp_v2f16_lg:
2257; VI:       ; %bb.0: ; %entry
2258; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2259; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2260; VI-NEXT:    s_mov_b32 s7, 0xf000
2261; VI-NEXT:    s_mov_b32 s6, -1
2262; VI-NEXT:    s_mov_b32 s10, s6
2263; VI-NEXT:    s_mov_b32 s11, s7
2264; VI-NEXT:    s_waitcnt lgkmcnt(0)
2265; VI-NEXT:    s_mov_b32 s12, s2
2266; VI-NEXT:    s_mov_b32 s13, s3
2267; VI-NEXT:    s_mov_b32 s14, s6
2268; VI-NEXT:    s_mov_b32 s15, s7
2269; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2270; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2271; VI-NEXT:    s_mov_b32 s4, s0
2272; VI-NEXT:    s_mov_b32 s5, s1
2273; VI-NEXT:    s_waitcnt vmcnt(1)
2274; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2275; VI-NEXT:    s_waitcnt vmcnt(0)
2276; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2277; VI-NEXT:    v_cmp_lg_f16_e32 vcc, v1, v0
2278; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2279; VI-NEXT:    v_cmp_lg_f16_e32 vcc, v3, v2
2280; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2281; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2282; VI-NEXT:    s_endpgm
2283;
2284; GFX11-LABEL: fcmp_v2f16_lg:
2285; GFX11:       ; %bb.0: ; %entry
2286; GFX11-NEXT:    s_clause 0x1
2287; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2288; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2289; GFX11-NEXT:    s_mov_b32 s10, -1
2290; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2291; GFX11-NEXT:    s_mov_b32 s6, s10
2292; GFX11-NEXT:    s_mov_b32 s7, s11
2293; GFX11-NEXT:    s_mov_b32 s14, s10
2294; GFX11-NEXT:    s_mov_b32 s15, s11
2295; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2296; GFX11-NEXT:    s_mov_b32 s12, s2
2297; GFX11-NEXT:    s_mov_b32 s13, s3
2298; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2299; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2300; GFX11-NEXT:    s_mov_b32 s8, s0
2301; GFX11-NEXT:    s_mov_b32 s9, s1
2302; GFX11-NEXT:    s_waitcnt vmcnt(1)
2303; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2304; GFX11-NEXT:    s_waitcnt vmcnt(0)
2305; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2306; GFX11-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v1, v0
2307; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2308; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2309; GFX11-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v3, v2
2310; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2311; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2312; GFX11-NEXT:    s_endpgm
2313;
2314; GFX12-LABEL: fcmp_v2f16_lg:
2315; GFX12:       ; %bb.0: ; %entry
2316; GFX12-NEXT:    s_clause 0x1
2317; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2318; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2319; GFX12-NEXT:    s_mov_b32 s10, -1
2320; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2321; GFX12-NEXT:    s_mov_b32 s6, s10
2322; GFX12-NEXT:    s_mov_b32 s7, s11
2323; GFX12-NEXT:    s_mov_b32 s14, s10
2324; GFX12-NEXT:    s_mov_b32 s15, s11
2325; GFX12-NEXT:    s_wait_kmcnt 0x0
2326; GFX12-NEXT:    s_mov_b32 s12, s2
2327; GFX12-NEXT:    s_mov_b32 s13, s3
2328; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2329; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2330; GFX12-NEXT:    s_mov_b32 s8, s0
2331; GFX12-NEXT:    s_mov_b32 s9, s1
2332; GFX12-NEXT:    s_wait_loadcnt 0x1
2333; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2334; GFX12-NEXT:    s_wait_loadcnt 0x0
2335; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2336; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v1, v0
2337; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2338; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2339; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v3, v2
2340; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2341; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2342; GFX12-NEXT:    s_endpgm
2343    ptr addrspace(1) %r,
2344    ptr addrspace(1) %a,
2345    ptr addrspace(1) %b) {
2346entry:
2347  %a.val = load <2 x half>, ptr addrspace(1) %a
2348  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; one is applied per element: true only when the pair is ordered and a != b.
2349  %r.val = fcmp one <2 x half> %a.val, %b.val
  ; Each i1 element becomes -1 (true) or 0 (false) in the stored <2 x i32>.
2350  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2351  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2352  ret void
2353}
2354
2355
; Codegen test for `fcmp oge` on <2 x half>: both operands are loaded from
; global memory, compared lane by lane, and the <2 x i1> result is
; sign-extended to <2 x i32> and stored to %r.
; The SI run expects each half to be widened with v_cvt_f32_f16 and compared
; with v_cmp_ge_f32; the VI, GFX11 and GFX12 runs expect v_cmp_ge_f16 applied
; to the low halves and to the high halves extracted by a 16-bit right shift.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2356define amdgpu_kernel void @fcmp_v2f16_ge(
2357; SI-LABEL: fcmp_v2f16_ge:
2358; SI:       ; %bb.0: ; %entry
2359; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2360; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2361; SI-NEXT:    s_mov_b32 s11, 0xf000
2362; SI-NEXT:    s_mov_b32 s10, -1
2363; SI-NEXT:    s_mov_b32 s14, s10
2364; SI-NEXT:    s_mov_b32 s15, s11
2365; SI-NEXT:    s_waitcnt lgkmcnt(0)
2366; SI-NEXT:    s_mov_b32 s12, s2
2367; SI-NEXT:    s_mov_b32 s13, s3
2368; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2369; SI-NEXT:    s_mov_b32 s6, s10
2370; SI-NEXT:    s_mov_b32 s7, s11
2371; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2372; SI-NEXT:    s_mov_b32 s8, s0
2373; SI-NEXT:    s_mov_b32 s9, s1
2374; SI-NEXT:    s_waitcnt vmcnt(1)
2375; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2376; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2377; SI-NEXT:    s_waitcnt vmcnt(0)
2378; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2379; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2380; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2381; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2382; SI-NEXT:    v_cmp_ge_f32_e32 vcc, v2, v3
2383; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2384; SI-NEXT:    v_cmp_ge_f32_e32 vcc, v4, v1
2385; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2386; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2387; SI-NEXT:    s_endpgm
2388;
2389; VI-LABEL: fcmp_v2f16_ge:
2390; VI:       ; %bb.0: ; %entry
2391; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2392; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2393; VI-NEXT:    s_mov_b32 s7, 0xf000
2394; VI-NEXT:    s_mov_b32 s6, -1
2395; VI-NEXT:    s_mov_b32 s10, s6
2396; VI-NEXT:    s_mov_b32 s11, s7
2397; VI-NEXT:    s_waitcnt lgkmcnt(0)
2398; VI-NEXT:    s_mov_b32 s12, s2
2399; VI-NEXT:    s_mov_b32 s13, s3
2400; VI-NEXT:    s_mov_b32 s14, s6
2401; VI-NEXT:    s_mov_b32 s15, s7
2402; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2403; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2404; VI-NEXT:    s_mov_b32 s4, s0
2405; VI-NEXT:    s_mov_b32 s5, s1
2406; VI-NEXT:    s_waitcnt vmcnt(1)
2407; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2408; VI-NEXT:    s_waitcnt vmcnt(0)
2409; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2410; VI-NEXT:    v_cmp_ge_f16_e32 vcc, v1, v0
2411; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2412; VI-NEXT:    v_cmp_ge_f16_e32 vcc, v3, v2
2413; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2414; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2415; VI-NEXT:    s_endpgm
2416;
2417; GFX11-LABEL: fcmp_v2f16_ge:
2418; GFX11:       ; %bb.0: ; %entry
2419; GFX11-NEXT:    s_clause 0x1
2420; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2421; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2422; GFX11-NEXT:    s_mov_b32 s10, -1
2423; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2424; GFX11-NEXT:    s_mov_b32 s6, s10
2425; GFX11-NEXT:    s_mov_b32 s7, s11
2426; GFX11-NEXT:    s_mov_b32 s14, s10
2427; GFX11-NEXT:    s_mov_b32 s15, s11
2428; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2429; GFX11-NEXT:    s_mov_b32 s12, s2
2430; GFX11-NEXT:    s_mov_b32 s13, s3
2431; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2432; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2433; GFX11-NEXT:    s_mov_b32 s8, s0
2434; GFX11-NEXT:    s_mov_b32 s9, s1
2435; GFX11-NEXT:    s_waitcnt vmcnt(1)
2436; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2437; GFX11-NEXT:    s_waitcnt vmcnt(0)
2438; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2439; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v1, v0
2440; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2441; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2442; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v3, v2
2443; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2444; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2445; GFX11-NEXT:    s_endpgm
2446;
2447; GFX12-LABEL: fcmp_v2f16_ge:
2448; GFX12:       ; %bb.0: ; %entry
2449; GFX12-NEXT:    s_clause 0x1
2450; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2451; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2452; GFX12-NEXT:    s_mov_b32 s10, -1
2453; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2454; GFX12-NEXT:    s_mov_b32 s6, s10
2455; GFX12-NEXT:    s_mov_b32 s7, s11
2456; GFX12-NEXT:    s_mov_b32 s14, s10
2457; GFX12-NEXT:    s_mov_b32 s15, s11
2458; GFX12-NEXT:    s_wait_kmcnt 0x0
2459; GFX12-NEXT:    s_mov_b32 s12, s2
2460; GFX12-NEXT:    s_mov_b32 s13, s3
2461; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2462; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2463; GFX12-NEXT:    s_mov_b32 s8, s0
2464; GFX12-NEXT:    s_mov_b32 s9, s1
2465; GFX12-NEXT:    s_wait_loadcnt 0x1
2466; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2467; GFX12-NEXT:    s_wait_loadcnt 0x0
2468; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2469; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v1, v0
2470; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2471; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2472; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v3, v2
2473; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2474; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2475; GFX12-NEXT:    s_endpgm
2476    ptr addrspace(1) %r,
2477    ptr addrspace(1) %a,
2478    ptr addrspace(1) %b) {
2479entry:
2480  %a.val = load <2 x half>, ptr addrspace(1) %a
2481  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; oge: a lane is true only when neither operand is NaN and a >= b.
2482  %r.val = fcmp oge <2 x half> %a.val, %b.val
  ; sext turns each i1 lane into 0 or -1, the two values selected by the
  ; v_cndmask_b32 instructions in the assertion lines.
2483  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2484  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2485  ret void
2486}
2487
2488
; Codegen test for `fcmp ord` on <2 x half>: both operands are loaded from
; global memory, compared lane by lane, and the <2 x i1> result is
; sign-extended to <2 x i32> and stored to %r.
; The SI run expects each half to be widened with v_cvt_f32_f16 and compared
; with v_cmp_o_f32; the VI, GFX11 and GFX12 runs expect v_cmp_o_f16 applied
; to the low halves and to the high halves extracted by a 16-bit right shift.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2489define amdgpu_kernel void @fcmp_v2f16_o(
2490; SI-LABEL: fcmp_v2f16_o:
2491; SI:       ; %bb.0: ; %entry
2492; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2493; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2494; SI-NEXT:    s_mov_b32 s11, 0xf000
2495; SI-NEXT:    s_mov_b32 s10, -1
2496; SI-NEXT:    s_mov_b32 s14, s10
2497; SI-NEXT:    s_mov_b32 s15, s11
2498; SI-NEXT:    s_waitcnt lgkmcnt(0)
2499; SI-NEXT:    s_mov_b32 s12, s2
2500; SI-NEXT:    s_mov_b32 s13, s3
2501; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2502; SI-NEXT:    s_mov_b32 s6, s10
2503; SI-NEXT:    s_mov_b32 s7, s11
2504; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2505; SI-NEXT:    s_mov_b32 s8, s0
2506; SI-NEXT:    s_mov_b32 s9, s1
2507; SI-NEXT:    s_waitcnt vmcnt(1)
2508; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2509; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2510; SI-NEXT:    s_waitcnt vmcnt(0)
2511; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2512; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2513; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2514; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2515; SI-NEXT:    v_cmp_o_f32_e32 vcc, v2, v3
2516; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2517; SI-NEXT:    v_cmp_o_f32_e32 vcc, v4, v1
2518; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2519; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2520; SI-NEXT:    s_endpgm
2521;
2522; VI-LABEL: fcmp_v2f16_o:
2523; VI:       ; %bb.0: ; %entry
2524; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2525; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2526; VI-NEXT:    s_mov_b32 s7, 0xf000
2527; VI-NEXT:    s_mov_b32 s6, -1
2528; VI-NEXT:    s_mov_b32 s10, s6
2529; VI-NEXT:    s_mov_b32 s11, s7
2530; VI-NEXT:    s_waitcnt lgkmcnt(0)
2531; VI-NEXT:    s_mov_b32 s12, s2
2532; VI-NEXT:    s_mov_b32 s13, s3
2533; VI-NEXT:    s_mov_b32 s14, s6
2534; VI-NEXT:    s_mov_b32 s15, s7
2535; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2536; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2537; VI-NEXT:    s_mov_b32 s4, s0
2538; VI-NEXT:    s_mov_b32 s5, s1
2539; VI-NEXT:    s_waitcnt vmcnt(1)
2540; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2541; VI-NEXT:    s_waitcnt vmcnt(0)
2542; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2543; VI-NEXT:    v_cmp_o_f16_e32 vcc, v1, v0
2544; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2545; VI-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
2546; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2547; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2548; VI-NEXT:    s_endpgm
2549;
2550; GFX11-LABEL: fcmp_v2f16_o:
2551; GFX11:       ; %bb.0: ; %entry
2552; GFX11-NEXT:    s_clause 0x1
2553; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2554; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2555; GFX11-NEXT:    s_mov_b32 s10, -1
2556; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2557; GFX11-NEXT:    s_mov_b32 s6, s10
2558; GFX11-NEXT:    s_mov_b32 s7, s11
2559; GFX11-NEXT:    s_mov_b32 s14, s10
2560; GFX11-NEXT:    s_mov_b32 s15, s11
2561; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX11-NEXT:    s_mov_b32 s12, s2
2563; GFX11-NEXT:    s_mov_b32 s13, s3
2564; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2565; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2566; GFX11-NEXT:    s_mov_b32 s8, s0
2567; GFX11-NEXT:    s_mov_b32 s9, s1
2568; GFX11-NEXT:    s_waitcnt vmcnt(1)
2569; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2570; GFX11-NEXT:    s_waitcnt vmcnt(0)
2571; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2572; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v0
2573; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2574; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2575; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v2
2576; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2577; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2578; GFX11-NEXT:    s_endpgm
2579;
2580; GFX12-LABEL: fcmp_v2f16_o:
2581; GFX12:       ; %bb.0: ; %entry
2582; GFX12-NEXT:    s_clause 0x1
2583; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2584; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2585; GFX12-NEXT:    s_mov_b32 s10, -1
2586; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2587; GFX12-NEXT:    s_mov_b32 s6, s10
2588; GFX12-NEXT:    s_mov_b32 s7, s11
2589; GFX12-NEXT:    s_mov_b32 s14, s10
2590; GFX12-NEXT:    s_mov_b32 s15, s11
2591; GFX12-NEXT:    s_wait_kmcnt 0x0
2592; GFX12-NEXT:    s_mov_b32 s12, s2
2593; GFX12-NEXT:    s_mov_b32 s13, s3
2594; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2595; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2596; GFX12-NEXT:    s_mov_b32 s8, s0
2597; GFX12-NEXT:    s_mov_b32 s9, s1
2598; GFX12-NEXT:    s_wait_loadcnt 0x1
2599; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2600; GFX12-NEXT:    s_wait_loadcnt 0x0
2601; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2602; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v0
2603; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2604; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2605; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v2
2606; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2607; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2608; GFX12-NEXT:    s_endpgm
2609    ptr addrspace(1) %r,
2610    ptr addrspace(1) %a,
2611    ptr addrspace(1) %b) {
2612entry:
2613  %a.val = load <2 x half>, ptr addrspace(1) %a
2614  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; ord: a lane is true when neither operand is NaN.
2615  %r.val = fcmp ord <2 x half> %a.val, %b.val
  ; sext turns each i1 lane into 0 or -1, the two values selected by the
  ; v_cndmask_b32 instructions in the assertion lines.
2616  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2617  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2618  ret void
2619}
2620
2621
; Codegen test for `fcmp uno` on <2 x half>: both operands are loaded from
; global memory, compared lane by lane, and the <2 x i1> result is
; sign-extended to <2 x i32> and stored to %r.
; The SI run expects each half to be widened with v_cvt_f32_f16 and compared
; with v_cmp_u_f32; the VI, GFX11 and GFX12 runs expect v_cmp_u_f16 applied
; to the low halves and to the high halves extracted by a 16-bit right shift.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2622define amdgpu_kernel void @fcmp_v2f16_u(
2623; SI-LABEL: fcmp_v2f16_u:
2624; SI:       ; %bb.0: ; %entry
2625; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2626; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2627; SI-NEXT:    s_mov_b32 s11, 0xf000
2628; SI-NEXT:    s_mov_b32 s10, -1
2629; SI-NEXT:    s_mov_b32 s14, s10
2630; SI-NEXT:    s_mov_b32 s15, s11
2631; SI-NEXT:    s_waitcnt lgkmcnt(0)
2632; SI-NEXT:    s_mov_b32 s12, s2
2633; SI-NEXT:    s_mov_b32 s13, s3
2634; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2635; SI-NEXT:    s_mov_b32 s6, s10
2636; SI-NEXT:    s_mov_b32 s7, s11
2637; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2638; SI-NEXT:    s_mov_b32 s8, s0
2639; SI-NEXT:    s_mov_b32 s9, s1
2640; SI-NEXT:    s_waitcnt vmcnt(1)
2641; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2642; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2643; SI-NEXT:    s_waitcnt vmcnt(0)
2644; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2645; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2646; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2647; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2648; SI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v3
2649; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2650; SI-NEXT:    v_cmp_u_f32_e32 vcc, v4, v1
2651; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2652; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2653; SI-NEXT:    s_endpgm
2654;
2655; VI-LABEL: fcmp_v2f16_u:
2656; VI:       ; %bb.0: ; %entry
2657; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2658; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2659; VI-NEXT:    s_mov_b32 s7, 0xf000
2660; VI-NEXT:    s_mov_b32 s6, -1
2661; VI-NEXT:    s_mov_b32 s10, s6
2662; VI-NEXT:    s_mov_b32 s11, s7
2663; VI-NEXT:    s_waitcnt lgkmcnt(0)
2664; VI-NEXT:    s_mov_b32 s12, s2
2665; VI-NEXT:    s_mov_b32 s13, s3
2666; VI-NEXT:    s_mov_b32 s14, s6
2667; VI-NEXT:    s_mov_b32 s15, s7
2668; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2669; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2670; VI-NEXT:    s_mov_b32 s4, s0
2671; VI-NEXT:    s_mov_b32 s5, s1
2672; VI-NEXT:    s_waitcnt vmcnt(1)
2673; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2674; VI-NEXT:    s_waitcnt vmcnt(0)
2675; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2676; VI-NEXT:    v_cmp_u_f16_e32 vcc, v1, v0
2677; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2678; VI-NEXT:    v_cmp_u_f16_e32 vcc, v3, v2
2679; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2680; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2681; VI-NEXT:    s_endpgm
2682;
2683; GFX11-LABEL: fcmp_v2f16_u:
2684; GFX11:       ; %bb.0: ; %entry
2685; GFX11-NEXT:    s_clause 0x1
2686; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2687; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2688; GFX11-NEXT:    s_mov_b32 s10, -1
2689; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2690; GFX11-NEXT:    s_mov_b32 s6, s10
2691; GFX11-NEXT:    s_mov_b32 s7, s11
2692; GFX11-NEXT:    s_mov_b32 s14, s10
2693; GFX11-NEXT:    s_mov_b32 s15, s11
2694; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2695; GFX11-NEXT:    s_mov_b32 s12, s2
2696; GFX11-NEXT:    s_mov_b32 s13, s3
2697; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2698; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2699; GFX11-NEXT:    s_mov_b32 s8, s0
2700; GFX11-NEXT:    s_mov_b32 s9, s1
2701; GFX11-NEXT:    s_waitcnt vmcnt(1)
2702; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2703; GFX11-NEXT:    s_waitcnt vmcnt(0)
2704; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2705; GFX11-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1, v0
2706; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2708; GFX11-NEXT:    v_cmp_u_f16_e32 vcc_lo, v3, v2
2709; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2710; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2711; GFX11-NEXT:    s_endpgm
2712;
2713; GFX12-LABEL: fcmp_v2f16_u:
2714; GFX12:       ; %bb.0: ; %entry
2715; GFX12-NEXT:    s_clause 0x1
2716; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2717; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2718; GFX12-NEXT:    s_mov_b32 s10, -1
2719; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2720; GFX12-NEXT:    s_mov_b32 s6, s10
2721; GFX12-NEXT:    s_mov_b32 s7, s11
2722; GFX12-NEXT:    s_mov_b32 s14, s10
2723; GFX12-NEXT:    s_mov_b32 s15, s11
2724; GFX12-NEXT:    s_wait_kmcnt 0x0
2725; GFX12-NEXT:    s_mov_b32 s12, s2
2726; GFX12-NEXT:    s_mov_b32 s13, s3
2727; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2728; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2729; GFX12-NEXT:    s_mov_b32 s8, s0
2730; GFX12-NEXT:    s_mov_b32 s9, s1
2731; GFX12-NEXT:    s_wait_loadcnt 0x1
2732; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2733; GFX12-NEXT:    s_wait_loadcnt 0x0
2734; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2735; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1, v0
2736; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2737; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2738; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v3, v2
2739; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2740; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2741; GFX12-NEXT:    s_endpgm
2742    ptr addrspace(1) %r,
2743    ptr addrspace(1) %a,
2744    ptr addrspace(1) %b) {
2745entry:
2746  %a.val = load <2 x half>, ptr addrspace(1) %a
2747  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; uno: a lane is true when at least one operand is NaN.
2748  %r.val = fcmp uno <2 x half> %a.val, %b.val
  ; sext turns each i1 lane into 0 or -1, the two values selected by the
  ; v_cndmask_b32 instructions in the assertion lines.
2749  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2750  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2751  ret void
2752}
2753
; Codegen test for `fcmp ult` on <2 x half>: both operands are loaded from
; global memory, compared lane by lane, and the <2 x i1> result is
; sign-extended to <2 x i32> and stored to %r.
; The function is named after the expected compare mnemonic, v_cmp_nge
; ("not greater-or-equal"), rather than after the IR predicate.
; The SI run expects each half to be widened with v_cvt_f32_f16 and compared
; with v_cmp_nge_f32; the VI, GFX11 and GFX12 runs expect v_cmp_nge_f16 applied
; to the low halves and to the high halves extracted by a 16-bit right shift.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2754define amdgpu_kernel void @fcmp_v2f16_nge(
2755; SI-LABEL: fcmp_v2f16_nge:
2756; SI:       ; %bb.0: ; %entry
2757; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2758; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2759; SI-NEXT:    s_mov_b32 s11, 0xf000
2760; SI-NEXT:    s_mov_b32 s10, -1
2761; SI-NEXT:    s_mov_b32 s14, s10
2762; SI-NEXT:    s_mov_b32 s15, s11
2763; SI-NEXT:    s_waitcnt lgkmcnt(0)
2764; SI-NEXT:    s_mov_b32 s12, s2
2765; SI-NEXT:    s_mov_b32 s13, s3
2766; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2767; SI-NEXT:    s_mov_b32 s6, s10
2768; SI-NEXT:    s_mov_b32 s7, s11
2769; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2770; SI-NEXT:    s_mov_b32 s8, s0
2771; SI-NEXT:    s_mov_b32 s9, s1
2772; SI-NEXT:    s_waitcnt vmcnt(1)
2773; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2774; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2775; SI-NEXT:    s_waitcnt vmcnt(0)
2776; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2777; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2778; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2779; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2780; SI-NEXT:    v_cmp_nge_f32_e32 vcc, v2, v3
2781; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2782; SI-NEXT:    v_cmp_nge_f32_e32 vcc, v4, v1
2783; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2784; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2785; SI-NEXT:    s_endpgm
2786;
2787; VI-LABEL: fcmp_v2f16_nge:
2788; VI:       ; %bb.0: ; %entry
2789; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2790; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2791; VI-NEXT:    s_mov_b32 s7, 0xf000
2792; VI-NEXT:    s_mov_b32 s6, -1
2793; VI-NEXT:    s_mov_b32 s10, s6
2794; VI-NEXT:    s_mov_b32 s11, s7
2795; VI-NEXT:    s_waitcnt lgkmcnt(0)
2796; VI-NEXT:    s_mov_b32 s12, s2
2797; VI-NEXT:    s_mov_b32 s13, s3
2798; VI-NEXT:    s_mov_b32 s14, s6
2799; VI-NEXT:    s_mov_b32 s15, s7
2800; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2801; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2802; VI-NEXT:    s_mov_b32 s4, s0
2803; VI-NEXT:    s_mov_b32 s5, s1
2804; VI-NEXT:    s_waitcnt vmcnt(1)
2805; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2806; VI-NEXT:    s_waitcnt vmcnt(0)
2807; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2808; VI-NEXT:    v_cmp_nge_f16_e32 vcc, v1, v0
2809; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2810; VI-NEXT:    v_cmp_nge_f16_e32 vcc, v3, v2
2811; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2812; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2813; VI-NEXT:    s_endpgm
2814;
2815; GFX11-LABEL: fcmp_v2f16_nge:
2816; GFX11:       ; %bb.0: ; %entry
2817; GFX11-NEXT:    s_clause 0x1
2818; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2819; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2820; GFX11-NEXT:    s_mov_b32 s10, -1
2821; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2822; GFX11-NEXT:    s_mov_b32 s6, s10
2823; GFX11-NEXT:    s_mov_b32 s7, s11
2824; GFX11-NEXT:    s_mov_b32 s14, s10
2825; GFX11-NEXT:    s_mov_b32 s15, s11
2826; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX11-NEXT:    s_mov_b32 s12, s2
2828; GFX11-NEXT:    s_mov_b32 s13, s3
2829; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2830; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2831; GFX11-NEXT:    s_mov_b32 s8, s0
2832; GFX11-NEXT:    s_mov_b32 s9, s1
2833; GFX11-NEXT:    s_waitcnt vmcnt(1)
2834; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2835; GFX11-NEXT:    s_waitcnt vmcnt(0)
2836; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2837; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1, v0
2838; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2839; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2840; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v2
2841; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2842; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2843; GFX11-NEXT:    s_endpgm
2844;
2845; GFX12-LABEL: fcmp_v2f16_nge:
2846; GFX12:       ; %bb.0: ; %entry
2847; GFX12-NEXT:    s_clause 0x1
2848; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2849; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2850; GFX12-NEXT:    s_mov_b32 s10, -1
2851; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2852; GFX12-NEXT:    s_mov_b32 s6, s10
2853; GFX12-NEXT:    s_mov_b32 s7, s11
2854; GFX12-NEXT:    s_mov_b32 s14, s10
2855; GFX12-NEXT:    s_mov_b32 s15, s11
2856; GFX12-NEXT:    s_wait_kmcnt 0x0
2857; GFX12-NEXT:    s_mov_b32 s12, s2
2858; GFX12-NEXT:    s_mov_b32 s13, s3
2859; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2860; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2861; GFX12-NEXT:    s_mov_b32 s8, s0
2862; GFX12-NEXT:    s_mov_b32 s9, s1
2863; GFX12-NEXT:    s_wait_loadcnt 0x1
2864; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2865; GFX12-NEXT:    s_wait_loadcnt 0x0
2866; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2867; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1, v0
2868; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2869; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2870; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v2
2871; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2872; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2873; GFX12-NEXT:    s_endpgm
2874    ptr addrspace(1) %r,
2875    ptr addrspace(1) %a,
2876    ptr addrspace(1) %b) {
2877entry:
2878  %a.val = load <2 x half>, ptr addrspace(1) %a
2879  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; ult: a lane is true when either operand is NaN or a < b, i.e. the
  ; negation of "a >= b".
2880  %r.val = fcmp ult <2 x half> %a.val, %b.val
  ; sext turns each i1 lane into 0 or -1, the two values selected by the
  ; v_cndmask_b32 instructions in the assertion lines.
2881  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
2882  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
2883  ret void
2884}
2885
; Codegen test for `fcmp ueq` on <2 x half>: both operands are loaded from
; global memory, compared lane by lane, and the <2 x i1> result is
; sign-extended to <2 x i32> and stored to %r.
; The function is named after the expected compare mnemonic, v_cmp_nlg
; ("not less-or-greater"), rather than after the IR predicate.
; The SI run expects each half to be widened with v_cvt_f32_f16 and compared
; with v_cmp_nlg_f32; the VI, GFX11 and GFX12 runs expect v_cmp_nlg_f16 applied
; to the low halves and to the high halves extracted by a 16-bit right shift.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2886define amdgpu_kernel void @fcmp_v2f16_nlg(
2887; SI-LABEL: fcmp_v2f16_nlg:
2888; SI:       ; %bb.0: ; %entry
2889; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2890; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2891; SI-NEXT:    s_mov_b32 s11, 0xf000
2892; SI-NEXT:    s_mov_b32 s10, -1
2893; SI-NEXT:    s_mov_b32 s14, s10
2894; SI-NEXT:    s_mov_b32 s15, s11
2895; SI-NEXT:    s_waitcnt lgkmcnt(0)
2896; SI-NEXT:    s_mov_b32 s12, s2
2897; SI-NEXT:    s_mov_b32 s13, s3
2898; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
2899; SI-NEXT:    s_mov_b32 s6, s10
2900; SI-NEXT:    s_mov_b32 s7, s11
2901; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
2902; SI-NEXT:    s_mov_b32 s8, s0
2903; SI-NEXT:    s_mov_b32 s9, s1
2904; SI-NEXT:    s_waitcnt vmcnt(1)
2905; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2906; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2907; SI-NEXT:    s_waitcnt vmcnt(0)
2908; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
2909; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2910; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
2911; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2912; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, v2, v3
2913; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2914; SI-NEXT:    v_cmp_nlg_f32_e32 vcc, v4, v1
2915; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2916; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2917; SI-NEXT:    s_endpgm
2918;
2919; VI-LABEL: fcmp_v2f16_nlg:
2920; VI:       ; %bb.0: ; %entry
2921; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2922; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
2923; VI-NEXT:    s_mov_b32 s7, 0xf000
2924; VI-NEXT:    s_mov_b32 s6, -1
2925; VI-NEXT:    s_mov_b32 s10, s6
2926; VI-NEXT:    s_mov_b32 s11, s7
2927; VI-NEXT:    s_waitcnt lgkmcnt(0)
2928; VI-NEXT:    s_mov_b32 s12, s2
2929; VI-NEXT:    s_mov_b32 s13, s3
2930; VI-NEXT:    s_mov_b32 s14, s6
2931; VI-NEXT:    s_mov_b32 s15, s7
2932; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
2933; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
2934; VI-NEXT:    s_mov_b32 s4, s0
2935; VI-NEXT:    s_mov_b32 s5, s1
2936; VI-NEXT:    s_waitcnt vmcnt(1)
2937; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2938; VI-NEXT:    s_waitcnt vmcnt(0)
2939; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2940; VI-NEXT:    v_cmp_nlg_f16_e32 vcc, v1, v0
2941; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2942; VI-NEXT:    v_cmp_nlg_f16_e32 vcc, v3, v2
2943; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2944; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2945; VI-NEXT:    s_endpgm
2946;
2947; GFX11-LABEL: fcmp_v2f16_nlg:
2948; GFX11:       ; %bb.0: ; %entry
2949; GFX11-NEXT:    s_clause 0x1
2950; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2951; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2952; GFX11-NEXT:    s_mov_b32 s10, -1
2953; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2954; GFX11-NEXT:    s_mov_b32 s6, s10
2955; GFX11-NEXT:    s_mov_b32 s7, s11
2956; GFX11-NEXT:    s_mov_b32 s14, s10
2957; GFX11-NEXT:    s_mov_b32 s15, s11
2958; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2959; GFX11-NEXT:    s_mov_b32 s12, s2
2960; GFX11-NEXT:    s_mov_b32 s13, s3
2961; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2962; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
2963; GFX11-NEXT:    s_mov_b32 s8, s0
2964; GFX11-NEXT:    s_mov_b32 s9, s1
2965; GFX11-NEXT:    s_waitcnt vmcnt(1)
2966; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2967; GFX11-NEXT:    s_waitcnt vmcnt(0)
2968; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2969; GFX11-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v1, v0
2970; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2971; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
2972; GFX11-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v3, v2
2973; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2974; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2975; GFX11-NEXT:    s_endpgm
2976;
2977; GFX12-LABEL: fcmp_v2f16_nlg:
2978; GFX12:       ; %bb.0: ; %entry
2979; GFX12-NEXT:    s_clause 0x1
2980; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2981; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2982; GFX12-NEXT:    s_mov_b32 s10, -1
2983; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2984; GFX12-NEXT:    s_mov_b32 s6, s10
2985; GFX12-NEXT:    s_mov_b32 s7, s11
2986; GFX12-NEXT:    s_mov_b32 s14, s10
2987; GFX12-NEXT:    s_mov_b32 s15, s11
2988; GFX12-NEXT:    s_wait_kmcnt 0x0
2989; GFX12-NEXT:    s_mov_b32 s12, s2
2990; GFX12-NEXT:    s_mov_b32 s13, s3
2991; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2992; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
2993; GFX12-NEXT:    s_mov_b32 s8, s0
2994; GFX12-NEXT:    s_mov_b32 s9, s1
2995; GFX12-NEXT:    s_wait_loadcnt 0x1
2996; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2997; GFX12-NEXT:    s_wait_loadcnt 0x0
2998; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2999; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v1, v0
3000; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3001; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3002; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v3, v2
3003; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3004; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
3005; GFX12-NEXT:    s_endpgm
3006    ptr addrspace(1) %r,
3007    ptr addrspace(1) %a,
3008    ptr addrspace(1) %b) {
3009entry:
3010  %a.val = load <2 x half>, ptr addrspace(1) %a
3011  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; ueq: a lane is true when either operand is NaN or a == b, i.e. the
  ; negation of "a < b or a > b".
3012  %r.val = fcmp ueq <2 x half> %a.val, %b.val
  ; sext turns each i1 lane into 0 or -1, the two values selected by the
  ; v_cndmask_b32 instructions in the assertion lines.
3013  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3014  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
3015  ret void
3016}
3017
3018
; Codegen test for `fcmp ule` on <2 x half>: both operands are loaded from
; global memory, compared lane by lane, and the <2 x i1> result is
; sign-extended to <2 x i32> and stored to %r.
; The function is named after the expected compare mnemonic, v_cmp_ngt
; ("not greater-than"), rather than after the IR predicate.
; The SI run expects each half to be widened with v_cvt_f32_f16 and compared
; with v_cmp_ngt_f32; the VI, GFX11 and GFX12 runs expect v_cmp_ngt_f16 applied
; to the low halves and to the high halves extracted by a 16-bit right shift.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3019define amdgpu_kernel void @fcmp_v2f16_ngt(
3020; SI-LABEL: fcmp_v2f16_ngt:
3021; SI:       ; %bb.0: ; %entry
3022; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3023; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3024; SI-NEXT:    s_mov_b32 s11, 0xf000
3025; SI-NEXT:    s_mov_b32 s10, -1
3026; SI-NEXT:    s_mov_b32 s14, s10
3027; SI-NEXT:    s_mov_b32 s15, s11
3028; SI-NEXT:    s_waitcnt lgkmcnt(0)
3029; SI-NEXT:    s_mov_b32 s12, s2
3030; SI-NEXT:    s_mov_b32 s13, s3
3031; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
3032; SI-NEXT:    s_mov_b32 s6, s10
3033; SI-NEXT:    s_mov_b32 s7, s11
3034; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
3035; SI-NEXT:    s_mov_b32 s8, s0
3036; SI-NEXT:    s_mov_b32 s9, s1
3037; SI-NEXT:    s_waitcnt vmcnt(1)
3038; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
3039; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3040; SI-NEXT:    s_waitcnt vmcnt(0)
3041; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
3042; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3043; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
3044; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3045; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v3
3046; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3047; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v1
3048; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3049; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3050; SI-NEXT:    s_endpgm
3051;
3052; VI-LABEL: fcmp_v2f16_ngt:
3053; VI:       ; %bb.0: ; %entry
3054; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3055; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
3056; VI-NEXT:    s_mov_b32 s7, 0xf000
3057; VI-NEXT:    s_mov_b32 s6, -1
3058; VI-NEXT:    s_mov_b32 s10, s6
3059; VI-NEXT:    s_mov_b32 s11, s7
3060; VI-NEXT:    s_waitcnt lgkmcnt(0)
3061; VI-NEXT:    s_mov_b32 s12, s2
3062; VI-NEXT:    s_mov_b32 s13, s3
3063; VI-NEXT:    s_mov_b32 s14, s6
3064; VI-NEXT:    s_mov_b32 s15, s7
3065; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
3066; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
3067; VI-NEXT:    s_mov_b32 s4, s0
3068; VI-NEXT:    s_mov_b32 s5, s1
3069; VI-NEXT:    s_waitcnt vmcnt(1)
3070; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3071; VI-NEXT:    s_waitcnt vmcnt(0)
3072; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3073; VI-NEXT:    v_cmp_ngt_f16_e32 vcc, v1, v0
3074; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3075; VI-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
3076; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3077; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3078; VI-NEXT:    s_endpgm
3079;
3080; GFX11-LABEL: fcmp_v2f16_ngt:
3081; GFX11:       ; %bb.0: ; %entry
3082; GFX11-NEXT:    s_clause 0x1
3083; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3084; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3085; GFX11-NEXT:    s_mov_b32 s10, -1
3086; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
3087; GFX11-NEXT:    s_mov_b32 s6, s10
3088; GFX11-NEXT:    s_mov_b32 s7, s11
3089; GFX11-NEXT:    s_mov_b32 s14, s10
3090; GFX11-NEXT:    s_mov_b32 s15, s11
3091; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3092; GFX11-NEXT:    s_mov_b32 s12, s2
3093; GFX11-NEXT:    s_mov_b32 s13, s3
3094; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
3095; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
3096; GFX11-NEXT:    s_mov_b32 s8, s0
3097; GFX11-NEXT:    s_mov_b32 s9, s1
3098; GFX11-NEXT:    s_waitcnt vmcnt(1)
3099; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3100; GFX11-NEXT:    s_waitcnt vmcnt(0)
3101; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3102; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v0
3103; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3104; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3105; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
3106; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3107; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
3108; GFX11-NEXT:    s_endpgm
3109;
3110; GFX12-LABEL: fcmp_v2f16_ngt:
3111; GFX12:       ; %bb.0: ; %entry
3112; GFX12-NEXT:    s_clause 0x1
3113; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3114; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3115; GFX12-NEXT:    s_mov_b32 s10, -1
3116; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
3117; GFX12-NEXT:    s_mov_b32 s6, s10
3118; GFX12-NEXT:    s_mov_b32 s7, s11
3119; GFX12-NEXT:    s_mov_b32 s14, s10
3120; GFX12-NEXT:    s_mov_b32 s15, s11
3121; GFX12-NEXT:    s_wait_kmcnt 0x0
3122; GFX12-NEXT:    s_mov_b32 s12, s2
3123; GFX12-NEXT:    s_mov_b32 s13, s3
3124; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
3125; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
3126; GFX12-NEXT:    s_mov_b32 s8, s0
3127; GFX12-NEXT:    s_mov_b32 s9, s1
3128; GFX12-NEXT:    s_wait_loadcnt 0x1
3129; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3130; GFX12-NEXT:    s_wait_loadcnt 0x0
3131; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3132; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v0
3133; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3134; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3135; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
3136; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3137; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
3138; GFX12-NEXT:    s_endpgm
3139    ptr addrspace(1) %r,
3140    ptr addrspace(1) %a,
3141    ptr addrspace(1) %b) {
3142entry:
3143  %a.val = load <2 x half>, ptr addrspace(1) %a
3144  %b.val = load <2 x half>, ptr addrspace(1) %b
  ; ule: a lane is true when either operand is NaN or a <= b, i.e. the
  ; negation of "a > b".
3145  %r.val = fcmp ule <2 x half> %a.val, %b.val
  ; sext turns each i1 lane into 0 or -1, the two values selected by the
  ; v_cndmask_b32 instructions in the assertion lines.
3146  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3147  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
3148  ret void
3149}
3150
; Test: <2 x half> "not less-or-equal" comparison.
;
; The kernel loads one <2 x half> from %a and one from %b, compares them with
; `fcmp ugt` (true when either operand is a NaN or %a > %b), sign-extends the
; <2 x i1> result to <2 x i32> (so each lane is 0 or -1) and stores it to %r.
;
; What the assertions below pin down:
;   * SI run (no -mcpu): each half is widened with v_cvt_f32_f16 and the
;     comparison is done as v_cmp_nle_f32, once per lane.
;   * VI / GFX11 / GFX12 runs: the low halves are compared directly with
;     v_cmp_nle_f16; the high halves are first moved down with
;     v_lshrrev_b32 ..., 16 and then compared the same way.
;   * Every lane result is materialised with v_cndmask_b32 0, -1 and both
;     lanes are written with a single 64-bit buffer store.
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them instead of editing by hand.
3151define amdgpu_kernel void @fcmp_v2f16_nle(
3152; SI-LABEL: fcmp_v2f16_nle:
3153; SI:       ; %bb.0: ; %entry
3154; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3155; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3156; SI-NEXT:    s_mov_b32 s11, 0xf000
3157; SI-NEXT:    s_mov_b32 s10, -1
3158; SI-NEXT:    s_mov_b32 s14, s10
3159; SI-NEXT:    s_mov_b32 s15, s11
3160; SI-NEXT:    s_waitcnt lgkmcnt(0)
3161; SI-NEXT:    s_mov_b32 s12, s2
3162; SI-NEXT:    s_mov_b32 s13, s3
3163; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
3164; SI-NEXT:    s_mov_b32 s6, s10
3165; SI-NEXT:    s_mov_b32 s7, s11
3166; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
3167; SI-NEXT:    s_mov_b32 s8, s0
3168; SI-NEXT:    s_mov_b32 s9, s1
3169; SI-NEXT:    s_waitcnt vmcnt(1)
3170; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
3171; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3172; SI-NEXT:    s_waitcnt vmcnt(0)
3173; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
3174; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3175; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
3176; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3177; SI-NEXT:    v_cmp_nle_f32_e32 vcc, v2, v3
3178; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3179; SI-NEXT:    v_cmp_nle_f32_e32 vcc, v4, v1
3180; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3181; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3182; SI-NEXT:    s_endpgm
3183;
3184; VI-LABEL: fcmp_v2f16_nle:
3185; VI:       ; %bb.0: ; %entry
3186; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3187; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
3188; VI-NEXT:    s_mov_b32 s7, 0xf000
3189; VI-NEXT:    s_mov_b32 s6, -1
3190; VI-NEXT:    s_mov_b32 s10, s6
3191; VI-NEXT:    s_mov_b32 s11, s7
3192; VI-NEXT:    s_waitcnt lgkmcnt(0)
3193; VI-NEXT:    s_mov_b32 s12, s2
3194; VI-NEXT:    s_mov_b32 s13, s3
3195; VI-NEXT:    s_mov_b32 s14, s6
3196; VI-NEXT:    s_mov_b32 s15, s7
3197; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
3198; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
3199; VI-NEXT:    s_mov_b32 s4, s0
3200; VI-NEXT:    s_mov_b32 s5, s1
3201; VI-NEXT:    s_waitcnt vmcnt(1)
3202; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3203; VI-NEXT:    s_waitcnt vmcnt(0)
3204; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3205; VI-NEXT:    v_cmp_nle_f16_e32 vcc, v1, v0
3206; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3207; VI-NEXT:    v_cmp_nle_f16_e32 vcc, v3, v2
3208; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3209; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3210; VI-NEXT:    s_endpgm
3211;
3212; GFX11-LABEL: fcmp_v2f16_nle:
3213; GFX11:       ; %bb.0: ; %entry
3214; GFX11-NEXT:    s_clause 0x1
3215; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3216; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3217; GFX11-NEXT:    s_mov_b32 s10, -1
3218; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
3219; GFX11-NEXT:    s_mov_b32 s6, s10
3220; GFX11-NEXT:    s_mov_b32 s7, s11
3221; GFX11-NEXT:    s_mov_b32 s14, s10
3222; GFX11-NEXT:    s_mov_b32 s15, s11
3223; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3224; GFX11-NEXT:    s_mov_b32 s12, s2
3225; GFX11-NEXT:    s_mov_b32 s13, s3
3226; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
3227; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
3228; GFX11-NEXT:    s_mov_b32 s8, s0
3229; GFX11-NEXT:    s_mov_b32 s9, s1
3230; GFX11-NEXT:    s_waitcnt vmcnt(1)
3231; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3232; GFX11-NEXT:    s_waitcnt vmcnt(0)
3233; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3234; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v0
3235; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3236; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3237; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v2
3238; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3239; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
3240; GFX11-NEXT:    s_endpgm
3241;
3242; GFX12-LABEL: fcmp_v2f16_nle:
3243; GFX12:       ; %bb.0: ; %entry
3244; GFX12-NEXT:    s_clause 0x1
3245; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3246; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3247; GFX12-NEXT:    s_mov_b32 s10, -1
3248; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
3249; GFX12-NEXT:    s_mov_b32 s6, s10
3250; GFX12-NEXT:    s_mov_b32 s7, s11
3251; GFX12-NEXT:    s_mov_b32 s14, s10
3252; GFX12-NEXT:    s_mov_b32 s15, s11
3253; GFX12-NEXT:    s_wait_kmcnt 0x0
3254; GFX12-NEXT:    s_mov_b32 s12, s2
3255; GFX12-NEXT:    s_mov_b32 s13, s3
3256; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
3257; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
3258; GFX12-NEXT:    s_mov_b32 s8, s0
3259; GFX12-NEXT:    s_mov_b32 s9, s1
3260; GFX12-NEXT:    s_wait_loadcnt 0x1
3261; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3262; GFX12-NEXT:    s_wait_loadcnt 0x0
3263; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3264; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v0
3265; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3266; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3267; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v2
3268; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3269; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
3270; GFX12-NEXT:    s_endpgm
3271    ptr addrspace(1) %r,
3272    ptr addrspace(1) %a,
3273    ptr addrspace(1) %b) {
3274entry:
3275  %a.val = load <2 x half>, ptr addrspace(1) %a
3276  %b.val = load <2 x half>, ptr addrspace(1) %b
; ugt = unordered or greater-than; this is the IR spelling of "not (a <= b)".
3277  %r.val = fcmp ugt <2 x half> %a.val, %b.val
; sext turns each i1 lane into an all-zeros / all-ones i32.
3278  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3279  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
3280  ret void
3281}
3282
; Test: <2 x half> "not equal (or unordered)" comparison.
;
; The kernel loads one <2 x half> from %a and one from %b, compares them with
; `fcmp une` (true when either operand is a NaN or %a != %b), sign-extends the
; <2 x i1> result to <2 x i32> (so each lane is 0 or -1) and stores it to %r.
;
; What the assertions below pin down:
;   * SI run (no -mcpu): each half is widened with v_cvt_f32_f16 and the
;     comparison is done as v_cmp_neq_f32, once per lane.
;   * VI / GFX11 / GFX12 runs: the low halves are compared directly with
;     v_cmp_neq_f16; the high halves are first moved down with
;     v_lshrrev_b32 ..., 16 and then compared the same way.
;   * Every lane result is materialised with v_cndmask_b32 0, -1 and both
;     lanes are written with a single 64-bit buffer store.
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them instead of editing by hand.
3283define amdgpu_kernel void @fcmp_v2f16_neq(
3284; SI-LABEL: fcmp_v2f16_neq:
3285; SI:       ; %bb.0: ; %entry
3286; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3287; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3288; SI-NEXT:    s_mov_b32 s11, 0xf000
3289; SI-NEXT:    s_mov_b32 s10, -1
3290; SI-NEXT:    s_mov_b32 s14, s10
3291; SI-NEXT:    s_mov_b32 s15, s11
3292; SI-NEXT:    s_waitcnt lgkmcnt(0)
3293; SI-NEXT:    s_mov_b32 s12, s2
3294; SI-NEXT:    s_mov_b32 s13, s3
3295; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
3296; SI-NEXT:    s_mov_b32 s6, s10
3297; SI-NEXT:    s_mov_b32 s7, s11
3298; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
3299; SI-NEXT:    s_mov_b32 s8, s0
3300; SI-NEXT:    s_mov_b32 s9, s1
3301; SI-NEXT:    s_waitcnt vmcnt(1)
3302; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
3303; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3304; SI-NEXT:    s_waitcnt vmcnt(0)
3305; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
3306; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3307; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
3308; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3309; SI-NEXT:    v_cmp_neq_f32_e32 vcc, v2, v3
3310; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3311; SI-NEXT:    v_cmp_neq_f32_e32 vcc, v4, v1
3312; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3313; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3314; SI-NEXT:    s_endpgm
3315;
3316; VI-LABEL: fcmp_v2f16_neq:
3317; VI:       ; %bb.0: ; %entry
3318; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3319; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
3320; VI-NEXT:    s_mov_b32 s7, 0xf000
3321; VI-NEXT:    s_mov_b32 s6, -1
3322; VI-NEXT:    s_mov_b32 s10, s6
3323; VI-NEXT:    s_mov_b32 s11, s7
3324; VI-NEXT:    s_waitcnt lgkmcnt(0)
3325; VI-NEXT:    s_mov_b32 s12, s2
3326; VI-NEXT:    s_mov_b32 s13, s3
3327; VI-NEXT:    s_mov_b32 s14, s6
3328; VI-NEXT:    s_mov_b32 s15, s7
3329; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
3330; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
3331; VI-NEXT:    s_mov_b32 s4, s0
3332; VI-NEXT:    s_mov_b32 s5, s1
3333; VI-NEXT:    s_waitcnt vmcnt(1)
3334; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3335; VI-NEXT:    s_waitcnt vmcnt(0)
3336; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3337; VI-NEXT:    v_cmp_neq_f16_e32 vcc, v1, v0
3338; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3339; VI-NEXT:    v_cmp_neq_f16_e32 vcc, v3, v2
3340; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3341; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3342; VI-NEXT:    s_endpgm
3343;
3344; GFX11-LABEL: fcmp_v2f16_neq:
3345; GFX11:       ; %bb.0: ; %entry
3346; GFX11-NEXT:    s_clause 0x1
3347; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3348; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3349; GFX11-NEXT:    s_mov_b32 s10, -1
3350; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
3351; GFX11-NEXT:    s_mov_b32 s6, s10
3352; GFX11-NEXT:    s_mov_b32 s7, s11
3353; GFX11-NEXT:    s_mov_b32 s14, s10
3354; GFX11-NEXT:    s_mov_b32 s15, s11
3355; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3356; GFX11-NEXT:    s_mov_b32 s12, s2
3357; GFX11-NEXT:    s_mov_b32 s13, s3
3358; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
3359; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
3360; GFX11-NEXT:    s_mov_b32 s8, s0
3361; GFX11-NEXT:    s_mov_b32 s9, s1
3362; GFX11-NEXT:    s_waitcnt vmcnt(1)
3363; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3364; GFX11-NEXT:    s_waitcnt vmcnt(0)
3365; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3366; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v1, v0
3367; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3368; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3369; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v3, v2
3370; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3371; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
3372; GFX11-NEXT:    s_endpgm
3373;
3374; GFX12-LABEL: fcmp_v2f16_neq:
3375; GFX12:       ; %bb.0: ; %entry
3376; GFX12-NEXT:    s_clause 0x1
3377; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3378; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3379; GFX12-NEXT:    s_mov_b32 s10, -1
3380; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
3381; GFX12-NEXT:    s_mov_b32 s6, s10
3382; GFX12-NEXT:    s_mov_b32 s7, s11
3383; GFX12-NEXT:    s_mov_b32 s14, s10
3384; GFX12-NEXT:    s_mov_b32 s15, s11
3385; GFX12-NEXT:    s_wait_kmcnt 0x0
3386; GFX12-NEXT:    s_mov_b32 s12, s2
3387; GFX12-NEXT:    s_mov_b32 s13, s3
3388; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
3389; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
3390; GFX12-NEXT:    s_mov_b32 s8, s0
3391; GFX12-NEXT:    s_mov_b32 s9, s1
3392; GFX12-NEXT:    s_wait_loadcnt 0x1
3393; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3394; GFX12-NEXT:    s_wait_loadcnt 0x0
3395; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3396; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v1, v0
3397; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3398; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3399; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v3, v2
3400; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3401; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
3402; GFX12-NEXT:    s_endpgm
3403    ptr addrspace(1) %r,
3404    ptr addrspace(1) %a,
3405    ptr addrspace(1) %b) {
3406entry:
3407  %a.val = load <2 x half>, ptr addrspace(1) %a
3408  %b.val = load <2 x half>, ptr addrspace(1) %b
; une = unordered or not-equal; this is the IR spelling of "not (a == b)".
3409  %r.val = fcmp une <2 x half> %a.val, %b.val
; sext turns each i1 lane into an all-zeros / all-ones i32.
3410  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3411  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
3412  ret void
3413}
3414
; Test: <2 x half> "not less-than" comparison.
;
; The kernel loads one <2 x half> from %a and one from %b, compares them with
; `fcmp uge` (true when either operand is a NaN or %a >= %b), sign-extends the
; <2 x i1> result to <2 x i32> (so each lane is 0 or -1) and stores it to %r.
;
; What the assertions below pin down:
;   * SI run (no -mcpu): each half is widened with v_cvt_f32_f16 and the
;     comparison is done as v_cmp_nlt_f32, once per lane.
;   * VI / GFX11 / GFX12 runs: the low halves are compared directly with
;     v_cmp_nlt_f16; the high halves are first moved down with
;     v_lshrrev_b32 ..., 16 and then compared the same way.
;   * Every lane result is materialised with v_cndmask_b32 0, -1 and both
;     lanes are written with a single 64-bit buffer store.
;
; The check lines are generated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them instead of editing by hand.
3415define amdgpu_kernel void @fcmp_v2f16_nlt(
3416; SI-LABEL: fcmp_v2f16_nlt:
3417; SI:       ; %bb.0: ; %entry
3418; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3419; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3420; SI-NEXT:    s_mov_b32 s11, 0xf000
3421; SI-NEXT:    s_mov_b32 s10, -1
3422; SI-NEXT:    s_mov_b32 s14, s10
3423; SI-NEXT:    s_mov_b32 s15, s11
3424; SI-NEXT:    s_waitcnt lgkmcnt(0)
3425; SI-NEXT:    s_mov_b32 s12, s2
3426; SI-NEXT:    s_mov_b32 s13, s3
3427; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
3428; SI-NEXT:    s_mov_b32 s6, s10
3429; SI-NEXT:    s_mov_b32 s7, s11
3430; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
3431; SI-NEXT:    s_mov_b32 s8, s0
3432; SI-NEXT:    s_mov_b32 s9, s1
3433; SI-NEXT:    s_waitcnt vmcnt(1)
3434; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
3435; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
3436; SI-NEXT:    s_waitcnt vmcnt(0)
3437; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
3438; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
3439; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
3440; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3441; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v2, v3
3442; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3443; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
3444; SI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3445; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3446; SI-NEXT:    s_endpgm
3447;
3448; VI-LABEL: fcmp_v2f16_nlt:
3449; VI:       ; %bb.0: ; %entry
3450; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3451; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
3452; VI-NEXT:    s_mov_b32 s7, 0xf000
3453; VI-NEXT:    s_mov_b32 s6, -1
3454; VI-NEXT:    s_mov_b32 s10, s6
3455; VI-NEXT:    s_mov_b32 s11, s7
3456; VI-NEXT:    s_waitcnt lgkmcnt(0)
3457; VI-NEXT:    s_mov_b32 s12, s2
3458; VI-NEXT:    s_mov_b32 s13, s3
3459; VI-NEXT:    s_mov_b32 s14, s6
3460; VI-NEXT:    s_mov_b32 s15, s7
3461; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
3462; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
3463; VI-NEXT:    s_mov_b32 s4, s0
3464; VI-NEXT:    s_mov_b32 s5, s1
3465; VI-NEXT:    s_waitcnt vmcnt(1)
3466; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3467; VI-NEXT:    s_waitcnt vmcnt(0)
3468; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3469; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v0
3470; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
3471; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v3, v2
3472; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
3473; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3474; VI-NEXT:    s_endpgm
3475;
3476; GFX11-LABEL: fcmp_v2f16_nlt:
3477; GFX11:       ; %bb.0: ; %entry
3478; GFX11-NEXT:    s_clause 0x1
3479; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3480; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3481; GFX11-NEXT:    s_mov_b32 s10, -1
3482; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
3483; GFX11-NEXT:    s_mov_b32 s6, s10
3484; GFX11-NEXT:    s_mov_b32 s7, s11
3485; GFX11-NEXT:    s_mov_b32 s14, s10
3486; GFX11-NEXT:    s_mov_b32 s15, s11
3487; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3488; GFX11-NEXT:    s_mov_b32 s12, s2
3489; GFX11-NEXT:    s_mov_b32 s13, s3
3490; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
3491; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
3492; GFX11-NEXT:    s_mov_b32 s8, s0
3493; GFX11-NEXT:    s_mov_b32 s9, s1
3494; GFX11-NEXT:    s_waitcnt vmcnt(1)
3495; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3496; GFX11-NEXT:    s_waitcnt vmcnt(0)
3497; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3498; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v0
3499; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3500; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3501; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
3502; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3503; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
3504; GFX11-NEXT:    s_endpgm
3505;
3506; GFX12-LABEL: fcmp_v2f16_nlt:
3507; GFX12:       ; %bb.0: ; %entry
3508; GFX12-NEXT:    s_clause 0x1
3509; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3510; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3511; GFX12-NEXT:    s_mov_b32 s10, -1
3512; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
3513; GFX12-NEXT:    s_mov_b32 s6, s10
3514; GFX12-NEXT:    s_mov_b32 s7, s11
3515; GFX12-NEXT:    s_mov_b32 s14, s10
3516; GFX12-NEXT:    s_mov_b32 s15, s11
3517; GFX12-NEXT:    s_wait_kmcnt 0x0
3518; GFX12-NEXT:    s_mov_b32 s12, s2
3519; GFX12-NEXT:    s_mov_b32 s13, s3
3520; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
3521; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
3522; GFX12-NEXT:    s_mov_b32 s8, s0
3523; GFX12-NEXT:    s_mov_b32 s9, s1
3524; GFX12-NEXT:    s_wait_loadcnt 0x1
3525; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
3526; GFX12-NEXT:    s_wait_loadcnt 0x0
3527; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3528; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v0
3529; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3530; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
3531; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
3532; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3533; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
3534; GFX12-NEXT:    s_endpgm
3535    ptr addrspace(1) %r,
3536    ptr addrspace(1) %a,
3537    ptr addrspace(1) %b) {
3538entry:
3539  %a.val = load <2 x half>, ptr addrspace(1) %a
3540  %b.val = load <2 x half>, ptr addrspace(1) %b
; uge = unordered or greater-or-equal; this is the IR spelling of "not (a < b)".
3541  %r.val = fcmp uge <2 x half> %a.val, %b.val
; sext turns each i1 lane into an all-zeros / all-ones i32.
3542  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
3543  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
3544  ret void
3545}
3546
; Prototype of the half-precision fabs intrinsic, tagged with attribute
; group #1 below.
; NOTE(review): none of the kernels immediately above call it; presumably an
; earlier test in this file does - confirm before removing.
3547declare half @llvm.fabs.f16(half) #1
3548
; Attribute groups referenced through the '#N' suffixes in this file.
; NOTE(review): no use of #0 is visible in this part of the file - confirm
; it is still referenced elsewhere.
3549attributes #0 = { nounwind }
3550attributes #1 = { nounwind readnone }
3551