xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
7
; Declarations of the llvm.minnum intrinsic overloads for half and for
; 2-, 3- and 4-element half vectors.
8declare half @llvm.minnum.f16(half %a, half %b)
9declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
10declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
11declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
12
; Kernel case: loads one half from %a and one from %b (volatile loads), takes
; llvm.minnum.f16 of the two and stores the result to %r.
; Per the assertions below, the tahiti run extends both inputs to f32, uses
; v_min_f32 and converts back, while the other runs emit v_max_f16 x, x on
; each input before v_min_f16.
; NOTE(review): the self-max presumably canonicalizes (quiets) the inputs
; because kernels run with IEEE mode enabled -- confirm against the backend.
13define amdgpu_kernel void @minnum_f16_ieee(
14; SI-LABEL: minnum_f16_ieee:
15; SI:       ; %bb.0: ; %entry
16; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
17; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
18; SI-NEXT:    s_mov_b32 s7, 0xf000
19; SI-NEXT:    s_mov_b32 s6, -1
20; SI-NEXT:    s_mov_b32 s14, s6
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_mov_b32 s12, s2
23; SI-NEXT:    s_mov_b32 s13, s3
24; SI-NEXT:    s_mov_b32 s15, s7
25; SI-NEXT:    s_mov_b32 s10, s6
26; SI-NEXT:    s_mov_b32 s11, s7
27; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    s_mov_b32 s4, s0
32; SI-NEXT:    s_mov_b32 s5, s1
33; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
34; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
35; SI-NEXT:    v_min_f32_e32 v0, v0, v1
36; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
37; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
38; SI-NEXT:    s_endpgm
39;
40; VI-LABEL: minnum_f16_ieee:
41; VI:       ; %bb.0: ; %entry
42; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
43; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
44; VI-NEXT:    s_mov_b32 s7, 0xf000
45; VI-NEXT:    s_mov_b32 s6, -1
46; VI-NEXT:    s_mov_b32 s14, s6
47; VI-NEXT:    s_waitcnt lgkmcnt(0)
48; VI-NEXT:    s_mov_b32 s12, s2
49; VI-NEXT:    s_mov_b32 s13, s3
50; VI-NEXT:    s_mov_b32 s15, s7
51; VI-NEXT:    s_mov_b32 s10, s6
52; VI-NEXT:    s_mov_b32 s11, s7
53; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
54; VI-NEXT:    s_waitcnt vmcnt(0)
55; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
56; VI-NEXT:    s_waitcnt vmcnt(0)
57; VI-NEXT:    s_mov_b32 s4, s0
58; VI-NEXT:    s_mov_b32 s5, s1
59; VI-NEXT:    v_max_f16_e32 v0, v0, v0
60; VI-NEXT:    v_max_f16_e32 v1, v1, v1
61; VI-NEXT:    v_min_f16_e32 v0, v0, v1
62; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
63; VI-NEXT:    s_endpgm
64;
65; GFX9-LABEL: minnum_f16_ieee:
66; GFX9:       ; %bb.0: ; %entry
67; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
68; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
69; GFX9-NEXT:    s_mov_b32 s7, 0xf000
70; GFX9-NEXT:    s_mov_b32 s6, -1
71; GFX9-NEXT:    s_mov_b32 s14, s6
72; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NEXT:    s_mov_b32 s12, s2
74; GFX9-NEXT:    s_mov_b32 s13, s3
75; GFX9-NEXT:    s_mov_b32 s15, s7
76; GFX9-NEXT:    s_mov_b32 s10, s6
77; GFX9-NEXT:    s_mov_b32 s11, s7
78; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
79; GFX9-NEXT:    s_waitcnt vmcnt(0)
80; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
81; GFX9-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-NEXT:    s_mov_b32 s4, s0
83; GFX9-NEXT:    s_mov_b32 s5, s1
84; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
85; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
86; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
87; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
88; GFX9-NEXT:    s_endpgm
89;
90; GFX10-LABEL: minnum_f16_ieee:
91; GFX10:       ; %bb.0: ; %entry
92; GFX10-NEXT:    s_clause 0x1
93; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
94; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
95; GFX10-NEXT:    s_mov_b32 s6, -1
96; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
97; GFX10-NEXT:    s_mov_b32 s14, s6
98; GFX10-NEXT:    s_mov_b32 s15, s7
99; GFX10-NEXT:    s_mov_b32 s10, s6
100; GFX10-NEXT:    s_mov_b32 s11, s7
101; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX10-NEXT:    s_mov_b32 s12, s2
103; GFX10-NEXT:    s_mov_b32 s13, s3
104; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
105; GFX10-NEXT:    s_waitcnt vmcnt(0)
106; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    s_mov_b32 s4, s0
109; GFX10-NEXT:    s_mov_b32 s5, s1
110; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
111; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
112; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
113; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
114; GFX10-NEXT:    s_endpgm
115;
116; GFX11-LABEL: minnum_f16_ieee:
117; GFX11:       ; %bb.0: ; %entry
118; GFX11-NEXT:    s_clause 0x1
119; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
120; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
121; GFX11-NEXT:    s_mov_b32 s10, -1
122; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
123; GFX11-NEXT:    s_mov_b32 s14, s10
124; GFX11-NEXT:    s_mov_b32 s15, s11
125; GFX11-NEXT:    s_mov_b32 s6, s10
126; GFX11-NEXT:    s_mov_b32 s7, s11
127; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX11-NEXT:    s_mov_b32 s12, s2
129; GFX11-NEXT:    s_mov_b32 s13, s3
130; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
131; GFX11-NEXT:    s_waitcnt vmcnt(0)
132; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
133; GFX11-NEXT:    s_waitcnt vmcnt(0)
134; GFX11-NEXT:    s_mov_b32 s8, s0
135; GFX11-NEXT:    s_mov_b32 s9, s1
136; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
137; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
138; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
139; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
140; GFX11-NEXT:    s_endpgm
141    ptr addrspace(1) %r,
142    ptr addrspace(1) %a,
143    ptr addrspace(1) %b) #0 {
144entry:
  ; Both loads are volatile; the assertions above expect each one as a glc
  ; buffer load followed by its own s_waitcnt vmcnt(0).
145  %a.val = load volatile half, ptr addrspace(1) %a
146  %b.val = load volatile half, ptr addrspace(1) %b
147  %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
148  store half %r.val, ptr addrspace(1) %r
149  ret void
150}
151
; Pixel-shader (amdgpu_ps) case: the operands arrive as arguments and the
; result is returned directly. On the f16-capable runs the expected output is
; a single v_min_f16 with no v_max_f16 x, x pre-pass; the tahiti run
; round-trips both arguments through f16/f32 conversions and uses v_min_f32.
; NOTE(review): presumably amdgpu_ps runs without IEEE mode, hence the name
; of this test -- confirm.
152define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
153; SI-LABEL: minnum_f16_no_ieee:
154; SI:       ; %bb.0:
155; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
156; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
157; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
158; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
159; SI-NEXT:    v_min_f32_e32 v0, v0, v1
160; SI-NEXT:    ; return to shader part epilog
161;
162; VI-LABEL: minnum_f16_no_ieee:
163; VI:       ; %bb.0:
164; VI-NEXT:    v_min_f16_e32 v0, v0, v1
165; VI-NEXT:    ; return to shader part epilog
166;
167; GFX9-LABEL: minnum_f16_no_ieee:
168; GFX9:       ; %bb.0:
169; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
170; GFX9-NEXT:    ; return to shader part epilog
171;
172; GFX10PLUS-LABEL: minnum_f16_no_ieee:
173; GFX10PLUS:       ; %bb.0:
174; GFX10PLUS-NEXT:    v_min_f16_e32 v0, v0, v1
175; GFX10PLUS-NEXT:    ; return to shader part epilog
176  %r.val = call half @llvm.minnum.f16(half %a, half %b)
177  ret half %r.val
178}
179
; Constant as the first operand: minnum(3.0, load %b), stored to %r.
; The assertions expect the constant as a literal in src0 of the min
; (0x4200 is 3.0 in f16; 0x40400000 is 3.0 in f32 for the tahiti run), and
; only the loaded value gets the v_max_f16 x, x pre-pass.
180define amdgpu_kernel void @minnum_f16_imm_a(
181; SI-LABEL: minnum_f16_imm_a:
182; SI:       ; %bb.0: ; %entry
183; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
184; SI-NEXT:    s_mov_b32 s7, 0xf000
185; SI-NEXT:    s_mov_b32 s6, -1
186; SI-NEXT:    s_mov_b32 s10, s6
187; SI-NEXT:    s_mov_b32 s11, s7
188; SI-NEXT:    s_waitcnt lgkmcnt(0)
189; SI-NEXT:    s_mov_b32 s8, s2
190; SI-NEXT:    s_mov_b32 s9, s3
191; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
192; SI-NEXT:    s_mov_b32 s4, s0
193; SI-NEXT:    s_mov_b32 s5, s1
194; SI-NEXT:    s_waitcnt vmcnt(0)
195; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
196; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
197; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
198; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
199; SI-NEXT:    s_endpgm
200;
201; VI-LABEL: minnum_f16_imm_a:
202; VI:       ; %bb.0: ; %entry
203; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
204; VI-NEXT:    s_mov_b32 s7, 0xf000
205; VI-NEXT:    s_mov_b32 s6, -1
206; VI-NEXT:    s_mov_b32 s10, s6
207; VI-NEXT:    s_mov_b32 s11, s7
208; VI-NEXT:    s_waitcnt lgkmcnt(0)
209; VI-NEXT:    s_mov_b32 s8, s2
210; VI-NEXT:    s_mov_b32 s9, s3
211; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
212; VI-NEXT:    s_mov_b32 s4, s0
213; VI-NEXT:    s_mov_b32 s5, s1
214; VI-NEXT:    s_waitcnt vmcnt(0)
215; VI-NEXT:    v_max_f16_e32 v0, v0, v0
216; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
217; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
218; VI-NEXT:    s_endpgm
219;
220; GFX9-LABEL: minnum_f16_imm_a:
221; GFX9:       ; %bb.0: ; %entry
222; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
223; GFX9-NEXT:    s_mov_b32 s7, 0xf000
224; GFX9-NEXT:    s_mov_b32 s6, -1
225; GFX9-NEXT:    s_mov_b32 s10, s6
226; GFX9-NEXT:    s_mov_b32 s11, s7
227; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX9-NEXT:    s_mov_b32 s8, s2
229; GFX9-NEXT:    s_mov_b32 s9, s3
230; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
231; GFX9-NEXT:    s_mov_b32 s4, s0
232; GFX9-NEXT:    s_mov_b32 s5, s1
233; GFX9-NEXT:    s_waitcnt vmcnt(0)
234; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
235; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
236; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
237; GFX9-NEXT:    s_endpgm
238;
239; GFX10-LABEL: minnum_f16_imm_a:
240; GFX10:       ; %bb.0: ; %entry
241; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
242; GFX10-NEXT:    s_mov_b32 s6, -1
243; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
244; GFX10-NEXT:    s_mov_b32 s10, s6
245; GFX10-NEXT:    s_mov_b32 s11, s7
246; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
247; GFX10-NEXT:    s_mov_b32 s8, s2
248; GFX10-NEXT:    s_mov_b32 s9, s3
249; GFX10-NEXT:    s_mov_b32 s4, s0
250; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
251; GFX10-NEXT:    s_mov_b32 s5, s1
252; GFX10-NEXT:    s_waitcnt vmcnt(0)
253; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
254; GFX10-NEXT:    v_min_f16_e32 v0, 0x4200, v0
255; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
256; GFX10-NEXT:    s_endpgm
257;
258; GFX11-LABEL: minnum_f16_imm_a:
259; GFX11:       ; %bb.0: ; %entry
260; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
261; GFX11-NEXT:    s_mov_b32 s6, -1
262; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
263; GFX11-NEXT:    s_mov_b32 s10, s6
264; GFX11-NEXT:    s_mov_b32 s11, s7
265; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX11-NEXT:    s_mov_b32 s8, s2
267; GFX11-NEXT:    s_mov_b32 s9, s3
268; GFX11-NEXT:    s_mov_b32 s4, s0
269; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
270; GFX11-NEXT:    s_mov_b32 s5, s1
271; GFX11-NEXT:    s_waitcnt vmcnt(0)
272; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
273; GFX11-NEXT:    v_min_f16_e32 v0, 0x4200, v0
274; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
275; GFX11-NEXT:    s_endpgm
276    ptr addrspace(1) %r,
277    ptr addrspace(1) %b) #0 {
278entry:
279  %b.val = load half, ptr addrspace(1) %b
280  %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
281  store half %r.val, ptr addrspace(1) %r
282  ret void
283}
284
; Constant as the second operand: minnum(load %a, 4.0), stored to %r.
; In every run the assertions expect 4.0 written directly as src0 of the min,
; i.e. the operands appear commuted relative to the IR call. Only the loaded
; value gets the v_max_f16 x, x pre-pass on the f16-capable runs.
285define amdgpu_kernel void @minnum_f16_imm_b(
286; SI-LABEL: minnum_f16_imm_b:
287; SI:       ; %bb.0: ; %entry
288; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
289; SI-NEXT:    s_mov_b32 s7, 0xf000
290; SI-NEXT:    s_mov_b32 s6, -1
291; SI-NEXT:    s_mov_b32 s10, s6
292; SI-NEXT:    s_mov_b32 s11, s7
293; SI-NEXT:    s_waitcnt lgkmcnt(0)
294; SI-NEXT:    s_mov_b32 s8, s2
295; SI-NEXT:    s_mov_b32 s9, s3
296; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
297; SI-NEXT:    s_mov_b32 s4, s0
298; SI-NEXT:    s_mov_b32 s5, s1
299; SI-NEXT:    s_waitcnt vmcnt(0)
300; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
301; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
302; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
303; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
304; SI-NEXT:    s_endpgm
305;
306; VI-LABEL: minnum_f16_imm_b:
307; VI:       ; %bb.0: ; %entry
308; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
309; VI-NEXT:    s_mov_b32 s7, 0xf000
310; VI-NEXT:    s_mov_b32 s6, -1
311; VI-NEXT:    s_mov_b32 s10, s6
312; VI-NEXT:    s_mov_b32 s11, s7
313; VI-NEXT:    s_waitcnt lgkmcnt(0)
314; VI-NEXT:    s_mov_b32 s8, s2
315; VI-NEXT:    s_mov_b32 s9, s3
316; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
317; VI-NEXT:    s_mov_b32 s4, s0
318; VI-NEXT:    s_mov_b32 s5, s1
319; VI-NEXT:    s_waitcnt vmcnt(0)
320; VI-NEXT:    v_max_f16_e32 v0, v0, v0
321; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
322; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
323; VI-NEXT:    s_endpgm
324;
325; GFX9-LABEL: minnum_f16_imm_b:
326; GFX9:       ; %bb.0: ; %entry
327; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
328; GFX9-NEXT:    s_mov_b32 s7, 0xf000
329; GFX9-NEXT:    s_mov_b32 s6, -1
330; GFX9-NEXT:    s_mov_b32 s10, s6
331; GFX9-NEXT:    s_mov_b32 s11, s7
332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX9-NEXT:    s_mov_b32 s8, s2
334; GFX9-NEXT:    s_mov_b32 s9, s3
335; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
336; GFX9-NEXT:    s_mov_b32 s4, s0
337; GFX9-NEXT:    s_mov_b32 s5, s1
338; GFX9-NEXT:    s_waitcnt vmcnt(0)
339; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
340; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
341; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
342; GFX9-NEXT:    s_endpgm
343;
344; GFX10-LABEL: minnum_f16_imm_b:
345; GFX10:       ; %bb.0: ; %entry
346; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
347; GFX10-NEXT:    s_mov_b32 s6, -1
348; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
349; GFX10-NEXT:    s_mov_b32 s10, s6
350; GFX10-NEXT:    s_mov_b32 s11, s7
351; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX10-NEXT:    s_mov_b32 s8, s2
353; GFX10-NEXT:    s_mov_b32 s9, s3
354; GFX10-NEXT:    s_mov_b32 s4, s0
355; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
356; GFX10-NEXT:    s_mov_b32 s5, s1
357; GFX10-NEXT:    s_waitcnt vmcnt(0)
358; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
359; GFX10-NEXT:    v_min_f16_e32 v0, 4.0, v0
360; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
361; GFX10-NEXT:    s_endpgm
362;
363; GFX11-LABEL: minnum_f16_imm_b:
364; GFX11:       ; %bb.0: ; %entry
365; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
366; GFX11-NEXT:    s_mov_b32 s6, -1
367; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
368; GFX11-NEXT:    s_mov_b32 s10, s6
369; GFX11-NEXT:    s_mov_b32 s11, s7
370; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
371; GFX11-NEXT:    s_mov_b32 s8, s2
372; GFX11-NEXT:    s_mov_b32 s9, s3
373; GFX11-NEXT:    s_mov_b32 s4, s0
374; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
375; GFX11-NEXT:    s_mov_b32 s5, s1
376; GFX11-NEXT:    s_waitcnt vmcnt(0)
377; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
378; GFX11-NEXT:    v_min_f16_e32 v0, 4.0, v0
379; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
380; GFX11-NEXT:    s_endpgm
381    ptr addrspace(1) %r,
382    ptr addrspace(1) %a) #0 {
383entry:
384  %a.val = load half, ptr addrspace(1) %a
385  %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
386  store half %r.val, ptr addrspace(1) %r
387  ret void
388}
389
; <2 x half> kernel case: loads both vectors, takes llvm.minnum.v2f16 and
; stores the result to %r. Per the assertions below:
;  - gfx900 and later use packed v_pk_max_f16 x, x on each input followed by
;    one v_pk_min_f16 on the whole dword;
;  - fiji handles the low and high halves separately (high halves extracted
;    with s_lshr_b32, min done as an SDWA op writing WORD_1) and ORs them;
;  - tahiti converts each half to f32, uses v_min_f32, converts back and
;    repacks with a shift and an or.
390define amdgpu_kernel void @minnum_v2f16_ieee(
391; SI-LABEL: minnum_v2f16_ieee:
392; SI:       ; %bb.0: ; %entry
393; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
394; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
395; SI-NEXT:    s_waitcnt lgkmcnt(0)
396; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
397; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
398; SI-NEXT:    s_mov_b32 s3, 0xf000
399; SI-NEXT:    s_waitcnt lgkmcnt(0)
400; SI-NEXT:    s_lshr_b32 s5, s2, 16
401; SI-NEXT:    s_lshr_b32 s6, s4, 16
402; SI-NEXT:    v_cvt_f32_f16_e32 v0, s5
403; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
404; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
405; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
406; SI-NEXT:    s_mov_b32 s2, -1
407; SI-NEXT:    v_min_f32_e32 v0, v0, v1
408; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
409; SI-NEXT:    v_min_f32_e32 v1, v2, v3
410; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
411; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
412; SI-NEXT:    v_or_b32_e32 v0, v1, v0
413; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
414; SI-NEXT:    s_endpgm
415;
416; VI-LABEL: minnum_v2f16_ieee:
417; VI:       ; %bb.0: ; %entry
418; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
419; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
420; VI-NEXT:    s_mov_b32 s7, 0xf000
421; VI-NEXT:    s_mov_b32 s6, -1
422; VI-NEXT:    s_waitcnt lgkmcnt(0)
423; VI-NEXT:    s_load_dword s8, s[8:9], 0x0
424; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
425; VI-NEXT:    s_mov_b32 s4, s0
426; VI-NEXT:    s_mov_b32 s5, s1
427; VI-NEXT:    s_waitcnt lgkmcnt(0)
428; VI-NEXT:    v_max_f16_e64 v0, s8, s8
429; VI-NEXT:    v_max_f16_e64 v1, s2, s2
430; VI-NEXT:    s_lshr_b32 s0, s8, 16
431; VI-NEXT:    v_min_f16_e32 v0, v1, v0
432; VI-NEXT:    v_max_f16_e64 v1, s0, s0
433; VI-NEXT:    s_lshr_b32 s0, s2, 16
434; VI-NEXT:    v_max_f16_e64 v2, s0, s0
435; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
436; VI-NEXT:    v_or_b32_e32 v0, v0, v1
437; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
438; VI-NEXT:    s_endpgm
439;
440; GFX9-LABEL: minnum_v2f16_ieee:
441; GFX9:       ; %bb.0: ; %entry
442; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
443; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
444; GFX9-NEXT:    s_mov_b32 s7, 0xf000
445; GFX9-NEXT:    s_mov_b32 s6, -1
446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
448; GFX9-NEXT:    s_load_dword s11, s[2:3], 0x0
449; GFX9-NEXT:    s_mov_b32 s4, s0
450; GFX9-NEXT:    s_mov_b32 s5, s1
451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
453; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
454; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
455; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
456; GFX9-NEXT:    s_endpgm
457;
458; GFX10-LABEL: minnum_v2f16_ieee:
459; GFX10:       ; %bb.0: ; %entry
460; GFX10-NEXT:    s_clause 0x1
461; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
462; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
463; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX10-NEXT:    s_load_dword s4, s[6:7], 0x0
465; GFX10-NEXT:    s_load_dword s5, s[2:3], 0x0
466; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
467; GFX10-NEXT:    s_mov_b32 s2, -1
468; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX10-NEXT:    v_pk_max_f16 v0, s4, s4
470; GFX10-NEXT:    v_pk_max_f16 v1, s5, s5
471; GFX10-NEXT:    v_pk_min_f16 v0, v1, v0
472; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
473; GFX10-NEXT:    s_endpgm
474;
475; GFX11-LABEL: minnum_v2f16_ieee:
476; GFX11:       ; %bb.0: ; %entry
477; GFX11-NEXT:    s_clause 0x1
478; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
479; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
480; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX11-NEXT:    s_load_b32 s4, s[6:7], 0x0
482; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
483; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
484; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
486; GFX11-NEXT:    v_pk_max_f16 v1, s2, s2
487; GFX11-NEXT:    s_mov_b32 s2, -1
488; GFX11-NEXT:    v_pk_min_f16 v0, v1, v0
489; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
490; GFX11-NEXT:    s_endpgm
491    ptr addrspace(1) %r,
492    ptr addrspace(1) %a,
493    ptr addrspace(1) %b) #0 {
494entry:
495  %a.val = load <2 x half>, ptr addrspace(1) %a
496  %b.val = load <2 x half>, ptr addrspace(1) %b
497  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
498  store <2 x half> %r.val, ptr addrspace(1) %r
499  ret void
500}
501
; <2 x half> pixel-shader (amdgpu_ps) case with argument operands and a
; directly returned result. Per the assertions below:
;  - gfx900 and later expect a single v_pk_min_f16 and no self-max pre-pass;
;  - fiji expects an SDWA min on the high halves plus a regular min on the
;    low halves, combined with v_or_b32;
;  - tahiti works on v0..v3, converting each through f16/f32 and emitting two
;    v_min_f32.
502define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
503; SI-LABEL: minnum_v2f16_no_ieee:
504; SI:       ; %bb.0:
505; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
506; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
507; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
508; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
509; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
510; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
511; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
512; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
513; SI-NEXT:    v_min_f32_e32 v0, v0, v2
514; SI-NEXT:    v_min_f32_e32 v1, v1, v3
515; SI-NEXT:    ; return to shader part epilog
516;
517; VI-LABEL: minnum_v2f16_no_ieee:
518; VI:       ; %bb.0:
519; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
520; VI-NEXT:    v_min_f16_e32 v0, v0, v1
521; VI-NEXT:    v_or_b32_e32 v0, v0, v2
522; VI-NEXT:    ; return to shader part epilog
523;
524; GFX9-LABEL: minnum_v2f16_no_ieee:
525; GFX9:       ; %bb.0:
526; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
527; GFX9-NEXT:    ; return to shader part epilog
528;
529; GFX10PLUS-LABEL: minnum_v2f16_no_ieee:
530; GFX10PLUS:       ; %bb.0:
531; GFX10PLUS-NEXT:    v_pk_min_f16 v0, v0, v1
532; GFX10PLUS-NEXT:    ; return to shader part epilog
533  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
534  ret <2 x half> %r.val
535}
536
; Constant vector <3.0, 4.0> as the first operand of llvm.minnum.v2f16; the
; second operand is loaded from %b and the result is stored to %r.
; The packed constant is 0x44004200 (4.0 in the high half, 3.0 in the low
; half). Per the assertions below:
;  - gfx900 materializes it in an SGPR with s_mov_b32 before v_pk_min_f16;
;  - gfx1010 and gfx1100 use it as a literal operand of v_pk_min_f16;
;  - fiji uses the literal 0x4200 for the low half and moves 0x4400 into v2
;    for the SDWA min on the high half;
;  - tahiti uses 0x40400000 (3.0 as f32) for the low element and 4.0 for the
;    high element.
537define amdgpu_kernel void @minnum_v2f16_imm_a(
538; SI-LABEL: minnum_v2f16_imm_a:
539; SI:       ; %bb.0: ; %entry
540; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
541; SI-NEXT:    s_waitcnt lgkmcnt(0)
542; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
543; SI-NEXT:    s_waitcnt lgkmcnt(0)
544; SI-NEXT:    s_lshr_b32 s3, s2, 16
545; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
546; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
547; SI-NEXT:    s_mov_b32 s3, 0xf000
548; SI-NEXT:    s_mov_b32 s2, -1
549; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
550; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
551; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
552; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
553; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
554; SI-NEXT:    v_or_b32_e32 v0, v1, v0
555; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
556; SI-NEXT:    s_endpgm
557;
558; VI-LABEL: minnum_v2f16_imm_a:
559; VI:       ; %bb.0: ; %entry
560; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
561; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
562; VI-NEXT:    s_waitcnt lgkmcnt(0)
563; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
564; VI-NEXT:    s_mov_b32 s3, 0xf000
565; VI-NEXT:    s_mov_b32 s2, -1
566; VI-NEXT:    s_waitcnt lgkmcnt(0)
567; VI-NEXT:    v_max_f16_e64 v0, s4, s4
568; VI-NEXT:    s_lshr_b32 s4, s4, 16
569; VI-NEXT:    v_max_f16_e64 v1, s4, s4
570; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
571; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
572; VI-NEXT:    v_or_b32_e32 v0, v0, v1
573; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
574; VI-NEXT:    s_endpgm
575;
576; GFX9-LABEL: minnum_v2f16_imm_a:
577; GFX9:       ; %bb.0: ; %entry
578; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
579; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
581; GFX9-NEXT:    s_mov_b32 s3, 0xf000
582; GFX9-NEXT:    s_mov_b32 s2, -1
583; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
585; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
586; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
587; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
588; GFX9-NEXT:    s_endpgm
589;
590; GFX10-LABEL: minnum_v2f16_imm_a:
591; GFX10:       ; %bb.0: ; %entry
592; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
593; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
594; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
595; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
596; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
597; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
598; GFX10-NEXT:    s_mov_b32 s2, -1
599; GFX10-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
600; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
601; GFX10-NEXT:    s_endpgm
602;
603; GFX11-LABEL: minnum_v2f16_imm_a:
604; GFX11:       ; %bb.0: ; %entry
605; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
606; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
608; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
609; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
611; GFX11-NEXT:    s_mov_b32 s2, -1
612; GFX11-NEXT:    v_pk_min_f16 v0, 0x44004200, v0
613; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
614; GFX11-NEXT:    s_endpgm
615    ptr addrspace(1) %r,
616    ptr addrspace(1) %b) #0 {
617entry:
618  %b.val = load <2 x half>, ptr addrspace(1) %b
619  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
620  store <2 x half> %r.val, ptr addrspace(1) %r
621  ret void
622}
623
; Constant vector <4.0, 3.0> as the second operand of llvm.minnum.v2f16; the
; first operand is loaded from %a and the result is stored to %r.
; The packed constant is 0x42004400 (3.0 in the high half, 4.0 in the low
; half). Per the assertions below:
;  - gfx900 materializes it in an SGPR with s_mov_b32 before v_pk_min_f16;
;  - gfx1010 and gfx1100 use it as a literal operand of v_pk_min_f16;
;  - fiji uses 4.0 directly for the low half and moves 0x4200 into v2 for the
;    SDWA min on the high half;
;  - tahiti uses 4.0 for the low element and 0x40400000 (3.0 as f32) for the
;    high element.
624define amdgpu_kernel void @minnum_v2f16_imm_b(
625; SI-LABEL: minnum_v2f16_imm_b:
626; SI:       ; %bb.0: ; %entry
627; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
628; SI-NEXT:    s_waitcnt lgkmcnt(0)
629; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
630; SI-NEXT:    s_waitcnt lgkmcnt(0)
631; SI-NEXT:    s_lshr_b32 s3, s2, 16
632; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
633; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
634; SI-NEXT:    s_mov_b32 s3, 0xf000
635; SI-NEXT:    s_mov_b32 s2, -1
636; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
637; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
638; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
639; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
640; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
641; SI-NEXT:    v_or_b32_e32 v0, v1, v0
642; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
643; SI-NEXT:    s_endpgm
644;
645; VI-LABEL: minnum_v2f16_imm_b:
646; VI:       ; %bb.0: ; %entry
647; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
648; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
649; VI-NEXT:    s_waitcnt lgkmcnt(0)
650; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
651; VI-NEXT:    s_mov_b32 s3, 0xf000
652; VI-NEXT:    s_mov_b32 s2, -1
653; VI-NEXT:    s_waitcnt lgkmcnt(0)
654; VI-NEXT:    v_max_f16_e64 v0, s4, s4
655; VI-NEXT:    s_lshr_b32 s4, s4, 16
656; VI-NEXT:    v_max_f16_e64 v1, s4, s4
657; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
658; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
659; VI-NEXT:    v_or_b32_e32 v0, v0, v1
660; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
661; VI-NEXT:    s_endpgm
662;
663; GFX9-LABEL: minnum_v2f16_imm_b:
664; GFX9:       ; %bb.0: ; %entry
665; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
666; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
668; GFX9-NEXT:    s_mov_b32 s3, 0xf000
669; GFX9-NEXT:    s_mov_b32 s2, -1
670; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
671; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
672; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
673; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
674; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
675; GFX9-NEXT:    s_endpgm
676;
677; GFX10-LABEL: minnum_v2f16_imm_b:
678; GFX10:       ; %bb.0: ; %entry
679; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
680; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
682; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
683; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
685; GFX10-NEXT:    s_mov_b32 s2, -1
686; GFX10-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
687; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
688; GFX10-NEXT:    s_endpgm
689;
690; GFX11-LABEL: minnum_v2f16_imm_b:
691; GFX11:       ; %bb.0: ; %entry
692; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
693; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
695; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
696; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
697; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
698; GFX11-NEXT:    s_mov_b32 s2, -1
699; GFX11-NEXT:    v_pk_min_f16 v0, 0x42004400, v0
700; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
701; GFX11-NEXT:    s_endpgm
702    ptr addrspace(1) %r,
703    ptr addrspace(1) %a) #0 {
704entry:
705  %a.val = load <2 x half>, ptr addrspace(1) %a
706  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
707  store <2 x half> %r.val, ptr addrspace(1) %r
708  ret void
709}
710
711; FIXME: Scalarize with undef half
712define amdgpu_kernel void @minnum_v3f16(
713; SI-LABEL: minnum_v3f16:
714; SI:       ; %bb.0: ; %entry
715; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
716; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
717; SI-NEXT:    s_waitcnt lgkmcnt(0)
718; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
719; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
720; SI-NEXT:    s_mov_b32 s3, 0xf000
721; SI-NEXT:    s_mov_b32 s2, -1
722; SI-NEXT:    s_waitcnt lgkmcnt(0)
723; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
724; SI-NEXT:    s_lshr_b32 s7, s6, 16
725; SI-NEXT:    s_lshr_b32 s8, s4, 16
726; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
727; SI-NEXT:    v_cvt_f32_f16_e32 v2, s8
728; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
729; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
730; SI-NEXT:    v_cvt_f32_f16_e32 v5, s5
731; SI-NEXT:    v_min_f32_e32 v1, v1, v2
732; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
733; SI-NEXT:    v_min_f32_e32 v2, v3, v4
734; SI-NEXT:    v_min_f32_e32 v0, v0, v5
735; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
736; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
737; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
738; SI-NEXT:    v_or_b32_e32 v1, v2, v1
739; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
740; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
741; SI-NEXT:    s_endpgm
742;
743; VI-LABEL: minnum_v3f16:
744; VI:       ; %bb.0: ; %entry
745; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
746; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
747; VI-NEXT:    s_mov_b32 s7, 0xf000
748; VI-NEXT:    s_mov_b32 s6, -1
749; VI-NEXT:    s_waitcnt lgkmcnt(0)
750; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
751; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
752; VI-NEXT:    s_mov_b32 s4, s0
753; VI-NEXT:    s_mov_b32 s5, s1
754; VI-NEXT:    s_waitcnt lgkmcnt(0)
755; VI-NEXT:    v_max_f16_e64 v0, s8, s8
756; VI-NEXT:    v_max_f16_e64 v1, s2, s2
757; VI-NEXT:    s_lshr_b32 s0, s8, 16
758; VI-NEXT:    v_min_f16_e32 v0, v1, v0
759; VI-NEXT:    v_max_f16_e64 v1, s0, s0
760; VI-NEXT:    s_lshr_b32 s0, s2, 16
761; VI-NEXT:    v_max_f16_e64 v2, s0, s0
762; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
763; VI-NEXT:    v_or_b32_e32 v0, v0, v1
764; VI-NEXT:    v_max_f16_e64 v1, s9, s9
765; VI-NEXT:    v_max_f16_e64 v2, s3, s3
766; VI-NEXT:    v_min_f16_e32 v1, v2, v1
767; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
768; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
769; VI-NEXT:    s_endpgm
770;
771; GFX9-LABEL: minnum_v3f16:
772; GFX9:       ; %bb.0: ; %entry
773; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
774; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
775; GFX9-NEXT:    s_mov_b32 s7, 0xf000
776; GFX9-NEXT:    s_mov_b32 s6, -1
777; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
778; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
779; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x0
780; GFX9-NEXT:    s_mov_b32 s4, s0
781; GFX9-NEXT:    s_mov_b32 s5, s1
782; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
784; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
785; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
786; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
787; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
788; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
789; GFX9-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
790; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
791; GFX9-NEXT:    s_endpgm
792;
793; GFX10-LABEL: minnum_v3f16:
794; GFX10:       ; %bb.0: ; %entry
795; GFX10-NEXT:    s_clause 0x1
796; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
797; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
798; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
800; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
801; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
802; GFX10-NEXT:    s_mov_b32 s2, -1
803; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
804; GFX10-NEXT:    v_pk_max_f16 v1, s5, s5
805; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
806; GFX10-NEXT:    v_pk_max_f16 v0, s4, s4
807; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
808; GFX10-NEXT:    v_pk_min_f16 v1, v2, v1
809; GFX10-NEXT:    v_pk_min_f16 v0, v3, v0
810; GFX10-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
811; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
812; GFX10-NEXT:    s_endpgm
813;
814; GFX11-LABEL: minnum_v3f16:
815; GFX11:       ; %bb.0: ; %entry
816; GFX11-NEXT:    s_clause 0x1
817; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
818; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
819; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
820; GFX11-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
821; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
822; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX11-NEXT:    v_pk_max_f16 v1, s5, s5
824; GFX11-NEXT:    v_pk_max_f16 v2, s3, s3
825; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
826; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
827; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
828; GFX11-NEXT:    s_mov_b32 s2, -1
829; GFX11-NEXT:    v_pk_min_f16 v1, v2, v1
830; GFX11-NEXT:    v_pk_min_f16 v0, v3, v0
831; GFX11-NEXT:    s_clause 0x1
832; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
833; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
834; GFX11-NEXT:    s_endpgm
835    ptr addrspace(1) %r,
836    ptr addrspace(1) %a,
837    ptr addrspace(1) %b) #0 {
838entry:
839  %a.val = load <3 x half>, ptr addrspace(1) %a
840  %b.val = load <3 x half>, ptr addrspace(1) %b
841  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
842  store <3 x half> %r.val, ptr addrspace(1) %r
843  ret void
844}
845
; Kernel test for llvm.minnum on <4 x half>. Loads both vector operands from
; %a and %b, takes their element-wise minnum, and stores the result to %r.
; Per the autogenerated assertions below, the tahiti run extends every element
; to f32, applies v_min_f32, converts back and repacks with shift/or; the fiji
; run uses per-element v_min_f16 (SDWA form for the high halves); and the
; gfx900/gfx1010/gfx1100 runs use one v_pk_min_f16 per 32-bit pair.
; On the f16-capable targets each input is first passed through a
; max-with-itself (v_max_f16 / v_pk_max_f16) before the min.
; NOTE(review): presumably that step quiets signaling NaNs; confirm against
; the AMDGPU lowering.
846define amdgpu_kernel void @minnum_v4f16(
847; SI-LABEL: minnum_v4f16:
848; SI:       ; %bb.0: ; %entry
849; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
850; SI-NEXT:    s_mov_b32 s3, 0xf000
851; SI-NEXT:    s_mov_b32 s2, -1
852; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
853; SI-NEXT:    s_waitcnt lgkmcnt(0)
854; SI-NEXT:    s_load_dwordx2 s[6:7], s[10:11], 0x0
855; SI-NEXT:    s_mov_b32 s0, s8
856; SI-NEXT:    s_mov_b32 s1, s9
857; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
858; SI-NEXT:    s_waitcnt lgkmcnt(0)
859; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
860; SI-NEXT:    s_lshr_b32 s6, s6, 16
861; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
862; SI-NEXT:    s_lshr_b32 s6, s7, 16
863; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
864; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
865; SI-NEXT:    s_lshr_b32 s6, s5, 16
866; SI-NEXT:    s_lshr_b32 s4, s4, 16
867; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
868; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
869; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
870; SI-NEXT:    v_cvt_f32_f16_e32 v6, s5
871; SI-NEXT:    v_min_f32_e32 v3, v3, v5
872; SI-NEXT:    v_min_f32_e32 v2, v2, v7
873; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
874; SI-NEXT:    v_min_f32_e32 v1, v1, v6
875; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
876; SI-NEXT:    v_min_f32_e32 v0, v0, v4
877; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
878; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
879; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
880; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
881; SI-NEXT:    v_or_b32_e32 v1, v1, v3
882; SI-NEXT:    v_or_b32_e32 v0, v0, v2
883; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
884; SI-NEXT:    s_endpgm
885;
886; VI-LABEL: minnum_v4f16:
887; VI:       ; %bb.0: ; %entry
888; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
889; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
890; VI-NEXT:    s_mov_b32 s7, 0xf000
891; VI-NEXT:    s_mov_b32 s6, -1
892; VI-NEXT:    s_waitcnt lgkmcnt(0)
893; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
894; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
895; VI-NEXT:    s_mov_b32 s4, s0
896; VI-NEXT:    s_mov_b32 s5, s1
897; VI-NEXT:    s_waitcnt lgkmcnt(0)
898; VI-NEXT:    v_max_f16_e64 v0, s9, s9
899; VI-NEXT:    v_max_f16_e64 v1, s3, s3
900; VI-NEXT:    s_lshr_b32 s0, s9, 16
901; VI-NEXT:    v_min_f16_e32 v0, v1, v0
902; VI-NEXT:    v_max_f16_e64 v1, s0, s0
903; VI-NEXT:    s_lshr_b32 s0, s3, 16
904; VI-NEXT:    v_max_f16_e64 v2, s0, s0
905; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
906; VI-NEXT:    v_or_b32_e32 v1, v0, v1
907; VI-NEXT:    v_max_f16_e64 v0, s8, s8
908; VI-NEXT:    v_max_f16_e64 v2, s2, s2
909; VI-NEXT:    s_lshr_b32 s0, s8, 16
910; VI-NEXT:    v_min_f16_e32 v0, v2, v0
911; VI-NEXT:    v_max_f16_e64 v2, s0, s0
912; VI-NEXT:    s_lshr_b32 s0, s2, 16
913; VI-NEXT:    v_max_f16_e64 v3, s0, s0
914; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
915; VI-NEXT:    v_or_b32_e32 v0, v0, v2
916; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
917; VI-NEXT:    s_endpgm
918;
919; GFX9-LABEL: minnum_v4f16:
920; GFX9:       ; %bb.0: ; %entry
921; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
922; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
923; GFX9-NEXT:    s_mov_b32 s7, 0xf000
924; GFX9-NEXT:    s_mov_b32 s6, -1
925; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
926; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
927; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x0
928; GFX9-NEXT:    s_mov_b32 s4, s0
929; GFX9-NEXT:    s_mov_b32 s5, s1
930; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
932; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
933; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
934; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
935; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
936; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
937; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
938; GFX9-NEXT:    s_endpgm
939;
940; GFX10-LABEL: minnum_v4f16:
941; GFX10:       ; %bb.0: ; %entry
942; GFX10-NEXT:    s_clause 0x1
943; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
944; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
945; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
947; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
948; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
949; GFX10-NEXT:    s_mov_b32 s2, -1
950; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX10-NEXT:    v_pk_max_f16 v0, s5, s5
952; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
953; GFX10-NEXT:    v_pk_max_f16 v2, s4, s4
954; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
955; GFX10-NEXT:    v_pk_min_f16 v1, v1, v0
956; GFX10-NEXT:    v_pk_min_f16 v0, v3, v2
957; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
958; GFX10-NEXT:    s_endpgm
959;
960; GFX11-LABEL: minnum_v4f16:
961; GFX11:       ; %bb.0: ; %entry
962; GFX11-NEXT:    s_clause 0x1
963; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
964; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
965; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
966; GFX11-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
967; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
968; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
969; GFX11-NEXT:    v_pk_max_f16 v0, s5, s5
970; GFX11-NEXT:    v_pk_max_f16 v1, s3, s3
971; GFX11-NEXT:    v_pk_max_f16 v2, s4, s4
972; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
973; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
974; GFX11-NEXT:    s_mov_b32 s2, -1
975; GFX11-NEXT:    v_pk_min_f16 v1, v1, v0
976; GFX11-NEXT:    v_pk_min_f16 v0, v3, v2
977; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
978; GFX11-NEXT:    s_endpgm
979    ptr addrspace(1) %r,
980    ptr addrspace(1) %a,
981    ptr addrspace(1) %b) #0 {
982entry:
983  %a.val = load <4 x half>, ptr addrspace(1) %a
984  %b.val = load <4 x half>, ptr addrspace(1) %b
985  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
986  store <4 x half> %r.val, ptr addrspace(1) %r
987  ret void
988}
989
; Kernel test for llvm.minnum on <4 x half> where the first operand is the
; constant vector <8.0, 2.0, 3.0, 4.0> and the second is loaded from %b; the
; result is stored to %r.
; The assertions below pin how each target encodes the constants. The tahiti
; run folds them into v_min_f32 as f32 operands (0x41000000 is 8.0,
; 0x40400000 is 3.0, with 2.0 and 4.0 written directly). The fiji run uses
; the f16 bit patterns 0x4800, 0x4000, 0x4200 and 0x4400. The packed-f16
; targets use the two 32-bit pairs 0x40004800 and 0x44004200, held in SGPRs
; on gfx900 and passed as literal operands on gfx1010/gfx1100.
; On the f16-capable targets only the loaded operand goes through the
; max-with-itself step (v_max_f16 / v_pk_max_f16) before the min.
990define amdgpu_kernel void @fmin_v4f16_imm_a(
991; SI-LABEL: fmin_v4f16_imm_a:
992; SI:       ; %bb.0: ; %entry
993; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
994; SI-NEXT:    s_waitcnt lgkmcnt(0)
995; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
996; SI-NEXT:    s_mov_b32 s3, 0xf000
997; SI-NEXT:    s_mov_b32 s2, -1
998; SI-NEXT:    s_waitcnt lgkmcnt(0)
999; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
1000; SI-NEXT:    s_lshr_b32 s5, s5, 16
1001; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
1002; SI-NEXT:    s_lshr_b32 s4, s4, 16
1003; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
1004; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
1005; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
1006; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
1007; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
1008; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1009; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
1010; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1011; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1012; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1013; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1014; SI-NEXT:    v_or_b32_e32 v1, v1, v2
1015; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1016; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1017; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1018; SI-NEXT:    s_endpgm
1019;
1020; VI-LABEL: fmin_v4f16_imm_a:
1021; VI:       ; %bb.0: ; %entry
1022; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1023; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
1024; VI-NEXT:    s_mov_b32 s7, 0xf000
1025; VI-NEXT:    s_mov_b32 s6, -1
1026; VI-NEXT:    s_waitcnt lgkmcnt(0)
1027; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1028; VI-NEXT:    s_mov_b32 s4, s0
1029; VI-NEXT:    s_mov_b32 s5, s1
1030; VI-NEXT:    s_waitcnt lgkmcnt(0)
1031; VI-NEXT:    s_lshr_b32 s0, s3, 16
1032; VI-NEXT:    v_max_f16_e64 v1, s3, s3
1033; VI-NEXT:    v_max_f16_e64 v3, s0, s0
1034; VI-NEXT:    v_max_f16_e64 v2, s2, s2
1035; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
1036; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1037; VI-NEXT:    s_lshr_b32 s0, s2, 16
1038; VI-NEXT:    v_or_b32_e32 v1, v1, v0
1039; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
1040; VI-NEXT:    v_max_f16_e64 v2, s0, s0
1041; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
1042; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1043; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1044; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1045; VI-NEXT:    s_endpgm
1046;
1047; GFX9-LABEL: fmin_v4f16_imm_a:
1048; GFX9:       ; %bb.0: ; %entry
1049; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1050; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
1051; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
1052; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1053; GFX9-NEXT:    s_mov_b32 s6, -1
1054; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1056; GFX9-NEXT:    s_mov_b32 s4, s0
1057; GFX9-NEXT:    s_mov_b32 s5, s1
1058; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1059; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
1060; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
1061; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
1062; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
1063; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1064; GFX9-NEXT:    s_endpgm
1065;
1066; GFX10-LABEL: fmin_v4f16_imm_a:
1067; GFX10:       ; %bb.0: ; %entry
1068; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1069; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1071; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1072; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
1073; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
1074; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1075; GFX10-NEXT:    s_mov_b32 s2, -1
1076; GFX10-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
1077; GFX10-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
1078; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1079; GFX10-NEXT:    s_endpgm
1080;
1081; GFX11-LABEL: fmin_v4f16_imm_a:
1082; GFX11:       ; %bb.0: ; %entry
1083; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1084; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1085; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
1086; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1087; GFX11-NEXT:    v_pk_max_f16 v0, s3, s3
1088; GFX11-NEXT:    v_pk_max_f16 v2, s2, s2
1089; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1090; GFX11-NEXT:    s_mov_b32 s2, -1
1091; GFX11-NEXT:    v_pk_min_f16 v1, 0x44004200, v0
1092; GFX11-NEXT:    v_pk_min_f16 v0, 0x40004800, v2
1093; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1094; GFX11-NEXT:    s_endpgm
1095    ptr addrspace(1) %r,
1096    ptr addrspace(1) %b) #0 {
1097entry:
1098  %b.val = load <4 x half>, ptr addrspace(1) %b
1099  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
1100  store <4 x half> %r.val, ptr addrspace(1) %r
1101  ret void
1102}
1103
; Attribute group #0, referenced by the kernel definitions above through the
; trailing "#0" on their signatures. It sets "denormal-fp-math-f32" to
; preserve-sign for both comma-separated components.
; NOTE(review): per the LLVM LangRef the two components should be the output
; and input denormal modes for f32; confirm the order there.
1104attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1105