xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7
; Declarations of the maxnum intrinsic for the scalar half type and for the
; 2-, 3- and 4-element half vector types.
8declare half @llvm.maxnum.f16(half %a, half %b)
9declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
10declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
11declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
12
; Scalar case: maxnum of two half values loaded with volatile loads from %a
; and %b; the result is stored to %r.
; For tahiti (the SI prefix) the checks expect both operands to be converted
; to f32, combined with v_max_f32 and converted back to f16. For the other
; targets the checks expect v_max_f16, with each loaded operand first passed
; through "v_max_f16 x, x" (presumably to quiet signaling NaNs before the
; real max -- TODO confirm).
13define amdgpu_kernel void @maxnum_f16(
14; SI-LABEL: maxnum_f16:
15; SI:       ; %bb.0: ; %entry
16; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
17; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
18; SI-NEXT:    s_mov_b32 s7, 0xf000
19; SI-NEXT:    s_mov_b32 s6, -1
20; SI-NEXT:    s_mov_b32 s14, s6
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_mov_b32 s12, s2
23; SI-NEXT:    s_mov_b32 s13, s3
24; SI-NEXT:    s_mov_b32 s15, s7
25; SI-NEXT:    s_mov_b32 s10, s6
26; SI-NEXT:    s_mov_b32 s11, s7
27; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    s_mov_b32 s4, s0
32; SI-NEXT:    s_mov_b32 s5, s1
33; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
34; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
35; SI-NEXT:    v_max_f32_e32 v0, v0, v1
36; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
37; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
38; SI-NEXT:    s_endpgm
39;
40; VI-LABEL: maxnum_f16:
41; VI:       ; %bb.0: ; %entry
42; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
43; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
44; VI-NEXT:    s_mov_b32 s7, 0xf000
45; VI-NEXT:    s_mov_b32 s6, -1
46; VI-NEXT:    s_mov_b32 s14, s6
47; VI-NEXT:    s_waitcnt lgkmcnt(0)
48; VI-NEXT:    s_mov_b32 s12, s2
49; VI-NEXT:    s_mov_b32 s13, s3
50; VI-NEXT:    s_mov_b32 s15, s7
51; VI-NEXT:    s_mov_b32 s10, s6
52; VI-NEXT:    s_mov_b32 s11, s7
53; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
54; VI-NEXT:    s_waitcnt vmcnt(0)
55; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
56; VI-NEXT:    s_waitcnt vmcnt(0)
57; VI-NEXT:    s_mov_b32 s4, s0
58; VI-NEXT:    s_mov_b32 s5, s1
59; VI-NEXT:    v_max_f16_e32 v0, v0, v0
60; VI-NEXT:    v_max_f16_e32 v1, v1, v1
61; VI-NEXT:    v_max_f16_e32 v0, v0, v1
62; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
63; VI-NEXT:    s_endpgm
64;
65; GFX9-LABEL: maxnum_f16:
66; GFX9:       ; %bb.0: ; %entry
67; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
68; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
69; GFX9-NEXT:    s_mov_b32 s7, 0xf000
70; GFX9-NEXT:    s_mov_b32 s6, -1
71; GFX9-NEXT:    s_mov_b32 s14, s6
72; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NEXT:    s_mov_b32 s12, s2
74; GFX9-NEXT:    s_mov_b32 s13, s3
75; GFX9-NEXT:    s_mov_b32 s15, s7
76; GFX9-NEXT:    s_mov_b32 s10, s6
77; GFX9-NEXT:    s_mov_b32 s11, s7
78; GFX9-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
79; GFX9-NEXT:    s_waitcnt vmcnt(0)
80; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
81; GFX9-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-NEXT:    s_mov_b32 s4, s0
83; GFX9-NEXT:    s_mov_b32 s5, s1
84; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
85; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
86; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
87; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
88; GFX9-NEXT:    s_endpgm
89;
90; GFX10-LABEL: maxnum_f16:
91; GFX10:       ; %bb.0: ; %entry
92; GFX10-NEXT:    s_clause 0x1
93; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
94; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
95; GFX10-NEXT:    s_mov_b32 s6, -1
96; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
97; GFX10-NEXT:    s_mov_b32 s14, s6
98; GFX10-NEXT:    s_mov_b32 s15, s7
99; GFX10-NEXT:    s_mov_b32 s10, s6
100; GFX10-NEXT:    s_mov_b32 s11, s7
101; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX10-NEXT:    s_mov_b32 s12, s2
103; GFX10-NEXT:    s_mov_b32 s13, s3
104; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
105; GFX10-NEXT:    s_waitcnt vmcnt(0)
106; GFX10-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    s_mov_b32 s4, s0
109; GFX10-NEXT:    s_mov_b32 s5, s1
110; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
111; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
112; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
113; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
114; GFX10-NEXT:    s_endpgm
115;
116; GFX11-LABEL: maxnum_f16:
117; GFX11:       ; %bb.0: ; %entry
118; GFX11-NEXT:    s_clause 0x1
119; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
120; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
121; GFX11-NEXT:    s_mov_b32 s10, -1
122; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
123; GFX11-NEXT:    s_mov_b32 s14, s10
124; GFX11-NEXT:    s_mov_b32 s15, s11
125; GFX11-NEXT:    s_mov_b32 s6, s10
126; GFX11-NEXT:    s_mov_b32 s7, s11
127; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX11-NEXT:    s_mov_b32 s12, s2
129; GFX11-NEXT:    s_mov_b32 s13, s3
130; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
131; GFX11-NEXT:    s_waitcnt vmcnt(0)
132; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
133; GFX11-NEXT:    s_waitcnt vmcnt(0)
134; GFX11-NEXT:    s_mov_b32 s8, s0
135; GFX11-NEXT:    s_mov_b32 s9, s1
136; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
137; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
139; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
140; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
141; GFX11-NEXT:    s_endpgm
142    ptr addrspace(1) %r,
143    ptr addrspace(1) %a,
144    ptr addrspace(1) %b) #0 {
145entry:
146  %a.val = load volatile half, ptr addrspace(1) %a
147  %b.val = load volatile half, ptr addrspace(1) %b
148  %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
149  store half %r.val, ptr addrspace(1) %r
150  ret void
151}
152
; Scalar case with the constant 3.0 as the first maxnum operand and a half
; loaded from %b as the second; the result is stored to %r.
; The checks expect the constant to appear as a literal source operand of the
; max instruction: 0x40400000 with v_max_f32 on tahiti, 0x4200 with v_max_f16
; on the other targets.
153define amdgpu_kernel void @maxnum_f16_imm_a(
154; SI-LABEL: maxnum_f16_imm_a:
155; SI:       ; %bb.0: ; %entry
156; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
157; SI-NEXT:    s_mov_b32 s7, 0xf000
158; SI-NEXT:    s_mov_b32 s6, -1
159; SI-NEXT:    s_mov_b32 s10, s6
160; SI-NEXT:    s_mov_b32 s11, s7
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    s_mov_b32 s8, s2
163; SI-NEXT:    s_mov_b32 s9, s3
164; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
165; SI-NEXT:    s_mov_b32 s4, s0
166; SI-NEXT:    s_mov_b32 s5, s1
167; SI-NEXT:    s_waitcnt vmcnt(0)
168; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
169; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
170; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
171; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
172; SI-NEXT:    s_endpgm
173;
174; VI-LABEL: maxnum_f16_imm_a:
175; VI:       ; %bb.0: ; %entry
176; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
177; VI-NEXT:    s_mov_b32 s7, 0xf000
178; VI-NEXT:    s_mov_b32 s6, -1
179; VI-NEXT:    s_mov_b32 s10, s6
180; VI-NEXT:    s_mov_b32 s11, s7
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    s_mov_b32 s8, s2
183; VI-NEXT:    s_mov_b32 s9, s3
184; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
185; VI-NEXT:    s_mov_b32 s4, s0
186; VI-NEXT:    s_mov_b32 s5, s1
187; VI-NEXT:    s_waitcnt vmcnt(0)
188; VI-NEXT:    v_max_f16_e32 v0, v0, v0
189; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
190; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
191; VI-NEXT:    s_endpgm
192;
193; GFX9-LABEL: maxnum_f16_imm_a:
194; GFX9:       ; %bb.0: ; %entry
195; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
196; GFX9-NEXT:    s_mov_b32 s7, 0xf000
197; GFX9-NEXT:    s_mov_b32 s6, -1
198; GFX9-NEXT:    s_mov_b32 s10, s6
199; GFX9-NEXT:    s_mov_b32 s11, s7
200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX9-NEXT:    s_mov_b32 s8, s2
202; GFX9-NEXT:    s_mov_b32 s9, s3
203; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
204; GFX9-NEXT:    s_mov_b32 s4, s0
205; GFX9-NEXT:    s_mov_b32 s5, s1
206; GFX9-NEXT:    s_waitcnt vmcnt(0)
207; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
208; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
209; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
210; GFX9-NEXT:    s_endpgm
211;
212; GFX10-LABEL: maxnum_f16_imm_a:
213; GFX10:       ; %bb.0: ; %entry
214; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
215; GFX10-NEXT:    s_mov_b32 s6, -1
216; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
217; GFX10-NEXT:    s_mov_b32 s10, s6
218; GFX10-NEXT:    s_mov_b32 s11, s7
219; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX10-NEXT:    s_mov_b32 s8, s2
221; GFX10-NEXT:    s_mov_b32 s9, s3
222; GFX10-NEXT:    s_mov_b32 s4, s0
223; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
224; GFX10-NEXT:    s_mov_b32 s5, s1
225; GFX10-NEXT:    s_waitcnt vmcnt(0)
226; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
227; GFX10-NEXT:    v_max_f16_e32 v0, 0x4200, v0
228; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
229; GFX10-NEXT:    s_endpgm
230;
231; GFX11-LABEL: maxnum_f16_imm_a:
232; GFX11:       ; %bb.0: ; %entry
233; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
234; GFX11-NEXT:    s_mov_b32 s6, -1
235; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
236; GFX11-NEXT:    s_mov_b32 s10, s6
237; GFX11-NEXT:    s_mov_b32 s11, s7
238; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
239; GFX11-NEXT:    s_mov_b32 s8, s2
240; GFX11-NEXT:    s_mov_b32 s9, s3
241; GFX11-NEXT:    s_mov_b32 s4, s0
242; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
243; GFX11-NEXT:    s_mov_b32 s5, s1
244; GFX11-NEXT:    s_waitcnt vmcnt(0)
245; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
246; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
247; GFX11-NEXT:    v_max_f16_e32 v0, 0x4200, v0
248; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
249; GFX11-NEXT:    s_endpgm
250    ptr addrspace(1) %r,
251    ptr addrspace(1) %b) #0 {
252entry:
253  %b.val = load half, ptr addrspace(1) %b
254  %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
255  store half %r.val, ptr addrspace(1) %r
256  ret void
257}
258
; Scalar case with a half loaded from %a as the first maxnum operand and the
; constant 4.0 as the second; the result is stored to %r.
; The checks expect the constant to be written directly as the operand "4.0"
; (not as a hex literal) in the first source position of the max instruction
; on every target.
259define amdgpu_kernel void @maxnum_f16_imm_b(
260; SI-LABEL: maxnum_f16_imm_b:
261; SI:       ; %bb.0: ; %entry
262; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
263; SI-NEXT:    s_mov_b32 s7, 0xf000
264; SI-NEXT:    s_mov_b32 s6, -1
265; SI-NEXT:    s_mov_b32 s10, s6
266; SI-NEXT:    s_mov_b32 s11, s7
267; SI-NEXT:    s_waitcnt lgkmcnt(0)
268; SI-NEXT:    s_mov_b32 s8, s2
269; SI-NEXT:    s_mov_b32 s9, s3
270; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
271; SI-NEXT:    s_mov_b32 s4, s0
272; SI-NEXT:    s_mov_b32 s5, s1
273; SI-NEXT:    s_waitcnt vmcnt(0)
274; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
275; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
276; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
277; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
278; SI-NEXT:    s_endpgm
279;
280; VI-LABEL: maxnum_f16_imm_b:
281; VI:       ; %bb.0: ; %entry
282; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
283; VI-NEXT:    s_mov_b32 s7, 0xf000
284; VI-NEXT:    s_mov_b32 s6, -1
285; VI-NEXT:    s_mov_b32 s10, s6
286; VI-NEXT:    s_mov_b32 s11, s7
287; VI-NEXT:    s_waitcnt lgkmcnt(0)
288; VI-NEXT:    s_mov_b32 s8, s2
289; VI-NEXT:    s_mov_b32 s9, s3
290; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
291; VI-NEXT:    s_mov_b32 s4, s0
292; VI-NEXT:    s_mov_b32 s5, s1
293; VI-NEXT:    s_waitcnt vmcnt(0)
294; VI-NEXT:    v_max_f16_e32 v0, v0, v0
295; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
296; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
297; VI-NEXT:    s_endpgm
298;
299; GFX9-LABEL: maxnum_f16_imm_b:
300; GFX9:       ; %bb.0: ; %entry
301; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
302; GFX9-NEXT:    s_mov_b32 s7, 0xf000
303; GFX9-NEXT:    s_mov_b32 s6, -1
304; GFX9-NEXT:    s_mov_b32 s10, s6
305; GFX9-NEXT:    s_mov_b32 s11, s7
306; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
307; GFX9-NEXT:    s_mov_b32 s8, s2
308; GFX9-NEXT:    s_mov_b32 s9, s3
309; GFX9-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
310; GFX9-NEXT:    s_mov_b32 s4, s0
311; GFX9-NEXT:    s_mov_b32 s5, s1
312; GFX9-NEXT:    s_waitcnt vmcnt(0)
313; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
314; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
315; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
316; GFX9-NEXT:    s_endpgm
317;
318; GFX10-LABEL: maxnum_f16_imm_b:
319; GFX10:       ; %bb.0: ; %entry
320; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
321; GFX10-NEXT:    s_mov_b32 s6, -1
322; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
323; GFX10-NEXT:    s_mov_b32 s10, s6
324; GFX10-NEXT:    s_mov_b32 s11, s7
325; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX10-NEXT:    s_mov_b32 s8, s2
327; GFX10-NEXT:    s_mov_b32 s9, s3
328; GFX10-NEXT:    s_mov_b32 s4, s0
329; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
330; GFX10-NEXT:    s_mov_b32 s5, s1
331; GFX10-NEXT:    s_waitcnt vmcnt(0)
332; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
333; GFX10-NEXT:    v_max_f16_e32 v0, 4.0, v0
334; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
335; GFX10-NEXT:    s_endpgm
336;
337; GFX11-LABEL: maxnum_f16_imm_b:
338; GFX11:       ; %bb.0: ; %entry
339; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
340; GFX11-NEXT:    s_mov_b32 s6, -1
341; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
342; GFX11-NEXT:    s_mov_b32 s10, s6
343; GFX11-NEXT:    s_mov_b32 s11, s7
344; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
345; GFX11-NEXT:    s_mov_b32 s8, s2
346; GFX11-NEXT:    s_mov_b32 s9, s3
347; GFX11-NEXT:    s_mov_b32 s4, s0
348; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
349; GFX11-NEXT:    s_mov_b32 s5, s1
350; GFX11-NEXT:    s_waitcnt vmcnt(0)
351; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
352; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
353; GFX11-NEXT:    v_max_f16_e32 v0, 4.0, v0
354; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
355; GFX11-NEXT:    s_endpgm
356    ptr addrspace(1) %r,
357    ptr addrspace(1) %a) #0 {
358entry:
359  %a.val = load half, ptr addrspace(1) %a
360  %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
361  store half %r.val, ptr addrspace(1) %r
362  ret void
363}
364
; Two-element vector case: maxnum of <2 x half> values loaded from %a and %b;
; the result is stored to %r as one dword.
; Expected lowering per the checks:
;  - tahiti: each element is converted to f32, combined with v_max_f32,
;    converted back, and the two halves are packed with a shift and an or.
;  - fiji: two v_max_f16 operations, the high half using the SDWA form that
;    writes WORD_1, followed by an or.
;  - gfx900 and later: a single v_pk_max_f16 on the packed values (after a
;    v_pk_max_f16 x, x on each input).
365define amdgpu_kernel void @maxnum_v2f16(
366; SI-LABEL: maxnum_v2f16:
367; SI:       ; %bb.0: ; %entry
368; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
369; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
370; SI-NEXT:    s_waitcnt lgkmcnt(0)
371; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
372; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
373; SI-NEXT:    s_mov_b32 s3, 0xf000
374; SI-NEXT:    s_waitcnt lgkmcnt(0)
375; SI-NEXT:    s_lshr_b32 s5, s2, 16
376; SI-NEXT:    s_lshr_b32 s6, s4, 16
377; SI-NEXT:    v_cvt_f32_f16_e32 v0, s5
378; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
379; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
380; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
381; SI-NEXT:    s_mov_b32 s2, -1
382; SI-NEXT:    v_max_f32_e32 v0, v0, v1
383; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
384; SI-NEXT:    v_max_f32_e32 v1, v2, v3
385; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
386; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
387; SI-NEXT:    v_or_b32_e32 v0, v1, v0
388; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
389; SI-NEXT:    s_endpgm
390;
391; VI-LABEL: maxnum_v2f16:
392; VI:       ; %bb.0: ; %entry
393; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
394; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
395; VI-NEXT:    s_mov_b32 s7, 0xf000
396; VI-NEXT:    s_mov_b32 s6, -1
397; VI-NEXT:    s_waitcnt lgkmcnt(0)
398; VI-NEXT:    s_load_dword s8, s[8:9], 0x0
399; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
400; VI-NEXT:    s_mov_b32 s4, s0
401; VI-NEXT:    s_mov_b32 s5, s1
402; VI-NEXT:    s_waitcnt lgkmcnt(0)
403; VI-NEXT:    v_max_f16_e64 v0, s8, s8
404; VI-NEXT:    v_max_f16_e64 v1, s2, s2
405; VI-NEXT:    s_lshr_b32 s0, s8, 16
406; VI-NEXT:    v_max_f16_e32 v0, v1, v0
407; VI-NEXT:    v_max_f16_e64 v1, s0, s0
408; VI-NEXT:    s_lshr_b32 s0, s2, 16
409; VI-NEXT:    v_max_f16_e64 v2, s0, s0
410; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
411; VI-NEXT:    v_or_b32_e32 v0, v0, v1
412; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
413; VI-NEXT:    s_endpgm
414;
415; GFX9-LABEL: maxnum_v2f16:
416; GFX9:       ; %bb.0: ; %entry
417; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
418; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
419; GFX9-NEXT:    s_mov_b32 s7, 0xf000
420; GFX9-NEXT:    s_mov_b32 s6, -1
421; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
423; GFX9-NEXT:    s_load_dword s11, s[2:3], 0x0
424; GFX9-NEXT:    s_mov_b32 s4, s0
425; GFX9-NEXT:    s_mov_b32 s5, s1
426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
428; GFX9-NEXT:    v_pk_max_f16 v1, s11, s11
429; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
430; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
431; GFX9-NEXT:    s_endpgm
432;
433; GFX10-LABEL: maxnum_v2f16:
434; GFX10:       ; %bb.0: ; %entry
435; GFX10-NEXT:    s_clause 0x1
436; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
437; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
438; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX10-NEXT:    s_load_dword s4, s[6:7], 0x0
440; GFX10-NEXT:    s_load_dword s5, s[2:3], 0x0
441; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
442; GFX10-NEXT:    s_mov_b32 s2, -1
443; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX10-NEXT:    v_pk_max_f16 v0, s4, s4
445; GFX10-NEXT:    v_pk_max_f16 v1, s5, s5
446; GFX10-NEXT:    v_pk_max_f16 v0, v1, v0
447; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
448; GFX10-NEXT:    s_endpgm
449;
450; GFX11-LABEL: maxnum_v2f16:
451; GFX11:       ; %bb.0: ; %entry
452; GFX11-NEXT:    s_clause 0x1
453; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
454; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
455; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX11-NEXT:    s_load_b32 s4, s[6:7], 0x0
457; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
458; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
459; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
461; GFX11-NEXT:    v_pk_max_f16 v1, s2, s2
462; GFX11-NEXT:    s_mov_b32 s2, -1
463; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
464; GFX11-NEXT:    v_pk_max_f16 v0, v1, v0
465; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
466; GFX11-NEXT:    s_endpgm
467    ptr addrspace(1) %r,
468    ptr addrspace(1) %a,
469    ptr addrspace(1) %b) #0 {
470entry:
471  %a.val = load <2 x half>, ptr addrspace(1) %a
472  %b.val = load <2 x half>, ptr addrspace(1) %b
473  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
474  store <2 x half> %r.val, ptr addrspace(1) %r
475  ret void
476}
477
; Two-element vector case with the constant <3.0, 4.0> as the first maxnum
; operand and a <2 x half> loaded from %b as the second; the result is stored
; to %r.
; Expected constant handling per the checks:
;  - tahiti: per-element v_max_f32 with 0x40400000 for the low element and
;    4.0 for the high element.
;  - fiji: literal 0x4200 for the low element; 0x4400 is first moved into v2
;    and used by the SDWA max for the high element.
;  - gfx900: the packed constant 0x44004200 is moved into an SGPR and then
;    used by v_pk_max_f16.
;  - gfx1010 / gfx1100: 0x44004200 appears directly as a literal operand of
;    v_pk_max_f16.
478define amdgpu_kernel void @maxnum_v2f16_imm_a(
479; SI-LABEL: maxnum_v2f16_imm_a:
480; SI:       ; %bb.0: ; %entry
481; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
482; SI-NEXT:    s_waitcnt lgkmcnt(0)
483; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
484; SI-NEXT:    s_waitcnt lgkmcnt(0)
485; SI-NEXT:    s_lshr_b32 s3, s2, 16
486; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
487; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
488; SI-NEXT:    s_mov_b32 s3, 0xf000
489; SI-NEXT:    s_mov_b32 s2, -1
490; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
491; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
492; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
493; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
494; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
495; SI-NEXT:    v_or_b32_e32 v0, v1, v0
496; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
497; SI-NEXT:    s_endpgm
498;
499; VI-LABEL: maxnum_v2f16_imm_a:
500; VI:       ; %bb.0: ; %entry
501; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
502; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
503; VI-NEXT:    s_waitcnt lgkmcnt(0)
504; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
505; VI-NEXT:    s_mov_b32 s3, 0xf000
506; VI-NEXT:    s_mov_b32 s2, -1
507; VI-NEXT:    s_waitcnt lgkmcnt(0)
508; VI-NEXT:    v_max_f16_e64 v0, s4, s4
509; VI-NEXT:    s_lshr_b32 s4, s4, 16
510; VI-NEXT:    v_max_f16_e64 v1, s4, s4
511; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
512; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
513; VI-NEXT:    v_or_b32_e32 v0, v0, v1
514; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
515; VI-NEXT:    s_endpgm
516;
517; GFX9-LABEL: maxnum_v2f16_imm_a:
518; GFX9:       ; %bb.0: ; %entry
519; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
520; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
522; GFX9-NEXT:    s_mov_b32 s3, 0xf000
523; GFX9-NEXT:    s_mov_b32 s2, -1
524; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
526; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
527; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
528; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
529; GFX9-NEXT:    s_endpgm
530;
531; GFX10-LABEL: maxnum_v2f16_imm_a:
532; GFX10:       ; %bb.0: ; %entry
533; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
534; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
536; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
537; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
539; GFX10-NEXT:    s_mov_b32 s2, -1
540; GFX10-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
541; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
542; GFX10-NEXT:    s_endpgm
543;
544; GFX11-LABEL: maxnum_v2f16_imm_a:
545; GFX11:       ; %bb.0: ; %entry
546; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
547; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
549; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
550; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
552; GFX11-NEXT:    s_mov_b32 s2, -1
553; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
554; GFX11-NEXT:    v_pk_max_f16 v0, 0x44004200, v0
555; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
556; GFX11-NEXT:    s_endpgm
557    ptr addrspace(1) %r,
558    ptr addrspace(1) %b) #0 {
559entry:
560  %b.val = load <2 x half>, ptr addrspace(1) %b
561  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
562  store <2 x half> %r.val, ptr addrspace(1) %r
563  ret void
564}
565
; Two-element vector case with a <2 x half> loaded from %a as the first
; maxnum operand and the constant <4.0, 3.0> as the second; the result is
; stored to %r.
; Expected constant handling per the checks:
;  - tahiti: per-element v_max_f32 with 4.0 for the low element and
;    0x40400000 for the high element.
;  - fiji: operand 4.0 for the low element; 0x4200 is first moved into v2 and
;    used by the SDWA max for the high element.
;  - gfx900: the packed constant 0x42004400 is moved into an SGPR and then
;    used by v_pk_max_f16.
;  - gfx1010 / gfx1100: 0x42004400 appears directly as a literal operand of
;    v_pk_max_f16.
566define amdgpu_kernel void @maxnum_v2f16_imm_b(
567; SI-LABEL: maxnum_v2f16_imm_b:
568; SI:       ; %bb.0: ; %entry
569; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
570; SI-NEXT:    s_waitcnt lgkmcnt(0)
571; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
572; SI-NEXT:    s_waitcnt lgkmcnt(0)
573; SI-NEXT:    s_lshr_b32 s3, s2, 16
574; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
575; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
576; SI-NEXT:    s_mov_b32 s3, 0xf000
577; SI-NEXT:    s_mov_b32 s2, -1
578; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
579; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
580; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
581; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
582; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
583; SI-NEXT:    v_or_b32_e32 v0, v1, v0
584; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
585; SI-NEXT:    s_endpgm
586;
587; VI-LABEL: maxnum_v2f16_imm_b:
588; VI:       ; %bb.0: ; %entry
589; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
590; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
591; VI-NEXT:    s_waitcnt lgkmcnt(0)
592; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
593; VI-NEXT:    s_mov_b32 s3, 0xf000
594; VI-NEXT:    s_mov_b32 s2, -1
595; VI-NEXT:    s_waitcnt lgkmcnt(0)
596; VI-NEXT:    v_max_f16_e64 v0, s4, s4
597; VI-NEXT:    s_lshr_b32 s4, s4, 16
598; VI-NEXT:    v_max_f16_e64 v1, s4, s4
599; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
600; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
601; VI-NEXT:    v_or_b32_e32 v0, v0, v1
602; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
603; VI-NEXT:    s_endpgm
604;
605; GFX9-LABEL: maxnum_v2f16_imm_b:
606; GFX9:       ; %bb.0: ; %entry
607; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
608; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
609; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
610; GFX9-NEXT:    s_mov_b32 s3, 0xf000
611; GFX9-NEXT:    s_mov_b32 s2, -1
612; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
614; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
615; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
616; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
617; GFX9-NEXT:    s_endpgm
618;
619; GFX10-LABEL: maxnum_v2f16_imm_b:
620; GFX10:       ; %bb.0: ; %entry
621; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
622; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
624; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
625; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
626; GFX10-NEXT:    v_pk_max_f16 v0, s2, s2
627; GFX10-NEXT:    s_mov_b32 s2, -1
628; GFX10-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
629; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
630; GFX10-NEXT:    s_endpgm
631;
632; GFX11-LABEL: maxnum_v2f16_imm_b:
633; GFX11:       ; %bb.0: ; %entry
634; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
635; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
637; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
638; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
639; GFX11-NEXT:    v_pk_max_f16 v0, s2, s2
640; GFX11-NEXT:    s_mov_b32 s2, -1
641; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
642; GFX11-NEXT:    v_pk_max_f16 v0, 0x42004400, v0
643; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
644; GFX11-NEXT:    s_endpgm
645    ptr addrspace(1) %r,
646    ptr addrspace(1) %a) #0 {
647entry:
648  %a.val = load <2 x half>, ptr addrspace(1) %a
649  %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
650  store <2 x half> %r.val, ptr addrspace(1) %r
651  ret void
652}
653
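654; FIXME: The op on the third element (whose partner half is undef) should be scalarized; the gfx900-and-later checks below still use v_pk_max_f16 for it.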
; Three-element vector case: maxnum of <3 x half> values loaded from %a and
; %b; the result is stored to %r.
; On every target the checks expect each input to be fetched with one 64-bit
; scalar load and the result to be written with two stores: a dword store for
; elements 0 and 1 and a short store at offset 4 for element 2.
; On gfx900 and later the checks expect v_pk_max_f16 for both the low pair
; and the dword holding element 2.
655define amdgpu_kernel void @maxnum_v3f16(
656; SI-LABEL: maxnum_v3f16:
657; SI:       ; %bb.0: ; %entry
658; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
659; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
660; SI-NEXT:    s_waitcnt lgkmcnt(0)
661; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
662; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
663; SI-NEXT:    s_mov_b32 s3, 0xf000
664; SI-NEXT:    s_mov_b32 s2, -1
665; SI-NEXT:    s_waitcnt lgkmcnt(0)
666; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
667; SI-NEXT:    s_lshr_b32 s7, s6, 16
668; SI-NEXT:    s_lshr_b32 s8, s4, 16
669; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
670; SI-NEXT:    v_cvt_f32_f16_e32 v2, s8
671; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
672; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
673; SI-NEXT:    v_cvt_f32_f16_e32 v5, s5
674; SI-NEXT:    v_max_f32_e32 v1, v1, v2
675; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
676; SI-NEXT:    v_max_f32_e32 v2, v3, v4
677; SI-NEXT:    v_max_f32_e32 v0, v0, v5
678; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
679; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
680; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
681; SI-NEXT:    v_or_b32_e32 v1, v2, v1
682; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
683; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
684; SI-NEXT:    s_endpgm
685;
686; VI-LABEL: maxnum_v3f16:
687; VI:       ; %bb.0: ; %entry
688; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
689; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
690; VI-NEXT:    s_mov_b32 s7, 0xf000
691; VI-NEXT:    s_mov_b32 s6, -1
692; VI-NEXT:    s_waitcnt lgkmcnt(0)
693; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
694; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
695; VI-NEXT:    s_mov_b32 s4, s0
696; VI-NEXT:    s_mov_b32 s5, s1
697; VI-NEXT:    s_waitcnt lgkmcnt(0)
698; VI-NEXT:    v_max_f16_e64 v0, s8, s8
699; VI-NEXT:    v_max_f16_e64 v1, s2, s2
700; VI-NEXT:    s_lshr_b32 s0, s8, 16
701; VI-NEXT:    v_max_f16_e32 v0, v1, v0
702; VI-NEXT:    v_max_f16_e64 v1, s0, s0
703; VI-NEXT:    s_lshr_b32 s0, s2, 16
704; VI-NEXT:    v_max_f16_e64 v2, s0, s0
705; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
706; VI-NEXT:    v_or_b32_e32 v0, v0, v1
707; VI-NEXT:    v_max_f16_e64 v1, s9, s9
708; VI-NEXT:    v_max_f16_e64 v2, s3, s3
709; VI-NEXT:    v_max_f16_e32 v1, v2, v1
710; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
711; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
712; VI-NEXT:    s_endpgm
713;
714; GFX9-LABEL: maxnum_v3f16:
715; GFX9:       ; %bb.0: ; %entry
716; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
717; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
718; GFX9-NEXT:    s_mov_b32 s7, 0xf000
719; GFX9-NEXT:    s_mov_b32 s6, -1
720; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
721; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
722; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x0
723; GFX9-NEXT:    s_mov_b32 s4, s0
724; GFX9-NEXT:    s_mov_b32 s5, s1
725; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX9-NEXT:    v_pk_max_f16 v0, s10, s10
727; GFX9-NEXT:    v_pk_max_f16 v1, s12, s12
728; GFX9-NEXT:    v_pk_max_f16 v2, s11, s11
729; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
730; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
731; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
732; GFX9-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
733; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
734; GFX9-NEXT:    s_endpgm
735;
736; GFX10-LABEL: maxnum_v3f16:
737; GFX10:       ; %bb.0: ; %entry
738; GFX10-NEXT:    s_clause 0x1
739; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
740; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
741; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
743; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
744; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
745; GFX10-NEXT:    s_mov_b32 s2, -1
746; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
747; GFX10-NEXT:    v_pk_max_f16 v1, s5, s5
748; GFX10-NEXT:    v_pk_max_f16 v2, s9, s9
749; GFX10-NEXT:    v_pk_max_f16 v0, s4, s4
750; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
751; GFX10-NEXT:    v_pk_max_f16 v1, v2, v1
752; GFX10-NEXT:    v_pk_max_f16 v0, v3, v0
753; GFX10-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
754; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
755; GFX10-NEXT:    s_endpgm
756;
757; GFX11-LABEL: maxnum_v3f16:
758; GFX11:       ; %bb.0: ; %entry
759; GFX11-NEXT:    s_clause 0x1
760; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
761; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
762; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
763; GFX11-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
764; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
765; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX11-NEXT:    v_pk_max_f16 v1, s5, s5
767; GFX11-NEXT:    v_pk_max_f16 v2, s3, s3
768; GFX11-NEXT:    v_pk_max_f16 v0, s4, s4
769; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
770; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
771; GFX11-NEXT:    s_mov_b32 s2, -1
772; GFX11-NEXT:    v_pk_max_f16 v1, v2, v1
773; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
774; GFX11-NEXT:    v_pk_max_f16 v0, v3, v0
775; GFX11-NEXT:    s_clause 0x1
776; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:4
777; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
778; GFX11-NEXT:    s_endpgm
779    ptr addrspace(1) %r,
780    ptr addrspace(1) %a,
781    ptr addrspace(1) %b) #0 {
782entry:
783  %a.val = load <3 x half>, ptr addrspace(1) %a
784  %b.val = load <3 x half>, ptr addrspace(1) %b
785  %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
786  store <3 x half> %r.val, ptr addrspace(1) %r
787  ret void
788}
789
; Test: llvm.maxnum on <4 x half> with both operands loaded through
; addrspace(1) pointers. The kernel loads %a and %b, calls
; @llvm.maxnum.v4f16 and stores the result through %r.
;
; The assertions inside this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; Lowering recorded by the checks below, per RUN line:
;   * SI run (tahiti) -- every half is widened with v_cvt_f32_f16, combined
;     with v_max_f32, narrowed with v_cvt_f16_f32, and the 16-bit results are
;     re-packed with v_lshlrev_b32 + v_or_b32 before one dwordx2 store.
;   * VI run (fiji) -- per-element v_max_f16; the high halves are extracted
;     with s_lshr_b32 and written with an SDWA v_max_f16 (dst_sel:WORD_1),
;     then merged with v_or_b32.
;   * GFX9 / GFX10 / GFX11 runs -- one v_pk_max_f16 per 32-bit pair of halves.
;
; NOTE(review): the "max of a register with itself" instructions in the VI and
; GFX9+ output (e.g. v_pk_max_f16 v0, s11, s11) look like input
; canonicalization performed before the real max -- confirm against the
; AMDGPU backend before relying on that reading.
790define amdgpu_kernel void @maxnum_v4f16(
791; SI-LABEL: maxnum_v4f16:
792; SI:       ; %bb.0: ; %entry
793; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
794; SI-NEXT:    s_mov_b32 s3, 0xf000
795; SI-NEXT:    s_mov_b32 s2, -1
796; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
797; SI-NEXT:    s_waitcnt lgkmcnt(0)
798; SI-NEXT:    s_load_dwordx2 s[6:7], s[10:11], 0x0
799; SI-NEXT:    s_mov_b32 s0, s8
800; SI-NEXT:    s_mov_b32 s1, s9
801; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
802; SI-NEXT:    s_waitcnt lgkmcnt(0)
803; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
804; SI-NEXT:    s_lshr_b32 s6, s6, 16
805; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
806; SI-NEXT:    s_lshr_b32 s6, s7, 16
807; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
808; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
809; SI-NEXT:    s_lshr_b32 s6, s5, 16
810; SI-NEXT:    s_lshr_b32 s4, s4, 16
811; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
812; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
813; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
814; SI-NEXT:    v_cvt_f32_f16_e32 v6, s5
815; SI-NEXT:    v_max_f32_e32 v3, v3, v5
816; SI-NEXT:    v_max_f32_e32 v2, v2, v7
817; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
818; SI-NEXT:    v_max_f32_e32 v1, v1, v6
819; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
820; SI-NEXT:    v_max_f32_e32 v0, v0, v4
821; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
822; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
823; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
824; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
825; SI-NEXT:    v_or_b32_e32 v1, v1, v3
826; SI-NEXT:    v_or_b32_e32 v0, v0, v2
827; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
828; SI-NEXT:    s_endpgm
829;
830; VI-LABEL: maxnum_v4f16:
831; VI:       ; %bb.0: ; %entry
832; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
833; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
834; VI-NEXT:    s_mov_b32 s7, 0xf000
835; VI-NEXT:    s_mov_b32 s6, -1
836; VI-NEXT:    s_waitcnt lgkmcnt(0)
837; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
838; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
839; VI-NEXT:    s_mov_b32 s4, s0
840; VI-NEXT:    s_mov_b32 s5, s1
841; VI-NEXT:    s_waitcnt lgkmcnt(0)
842; VI-NEXT:    v_max_f16_e64 v0, s9, s9
843; VI-NEXT:    v_max_f16_e64 v1, s3, s3
844; VI-NEXT:    s_lshr_b32 s0, s9, 16
845; VI-NEXT:    v_max_f16_e32 v0, v1, v0
846; VI-NEXT:    v_max_f16_e64 v1, s0, s0
847; VI-NEXT:    s_lshr_b32 s0, s3, 16
848; VI-NEXT:    v_max_f16_e64 v2, s0, s0
849; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
850; VI-NEXT:    v_or_b32_e32 v1, v0, v1
851; VI-NEXT:    v_max_f16_e64 v0, s8, s8
852; VI-NEXT:    v_max_f16_e64 v2, s2, s2
853; VI-NEXT:    s_lshr_b32 s0, s8, 16
854; VI-NEXT:    v_max_f16_e32 v0, v2, v0
855; VI-NEXT:    v_max_f16_e64 v2, s0, s0
856; VI-NEXT:    s_lshr_b32 s0, s2, 16
857; VI-NEXT:    v_max_f16_e64 v3, s0, s0
858; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
859; VI-NEXT:    v_or_b32_e32 v0, v0, v2
860; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
861; VI-NEXT:    s_endpgm
862;
863; GFX9-LABEL: maxnum_v4f16:
864; GFX9:       ; %bb.0: ; %entry
865; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
866; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
867; GFX9-NEXT:    s_mov_b32 s7, 0xf000
868; GFX9-NEXT:    s_mov_b32 s6, -1
869; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
870; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
871; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x0
872; GFX9-NEXT:    s_mov_b32 s4, s0
873; GFX9-NEXT:    s_mov_b32 s5, s1
874; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX9-NEXT:    v_pk_max_f16 v0, s11, s11
876; GFX9-NEXT:    v_pk_max_f16 v1, s13, s13
877; GFX9-NEXT:    v_pk_max_f16 v2, s10, s10
878; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
879; GFX9-NEXT:    v_pk_max_f16 v0, s12, s12
880; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
881; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
882; GFX9-NEXT:    s_endpgm
883;
884; GFX10-LABEL: maxnum_v4f16:
885; GFX10:       ; %bb.0: ; %entry
886; GFX10-NEXT:    s_clause 0x1
887; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
888; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
889; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
891; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
892; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
893; GFX10-NEXT:    s_mov_b32 s2, -1
894; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
895; GFX10-NEXT:    v_pk_max_f16 v0, s5, s5
896; GFX10-NEXT:    v_pk_max_f16 v1, s9, s9
897; GFX10-NEXT:    v_pk_max_f16 v2, s4, s4
898; GFX10-NEXT:    v_pk_max_f16 v3, s8, s8
899; GFX10-NEXT:    v_pk_max_f16 v1, v1, v0
900; GFX10-NEXT:    v_pk_max_f16 v0, v3, v2
901; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
902; GFX10-NEXT:    s_endpgm
903;
904; GFX11-LABEL: maxnum_v4f16:
905; GFX11:       ; %bb.0: ; %entry
906; GFX11-NEXT:    s_clause 0x1
907; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
908; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
909; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX11-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
911; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
912; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
913; GFX11-NEXT:    v_pk_max_f16 v0, s5, s5
914; GFX11-NEXT:    v_pk_max_f16 v1, s3, s3
915; GFX11-NEXT:    v_pk_max_f16 v2, s4, s4
916; GFX11-NEXT:    v_pk_max_f16 v3, s2, s2
917; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
918; GFX11-NEXT:    s_mov_b32 s2, -1
919; GFX11-NEXT:    v_pk_max_f16 v1, v1, v0
920; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
921; GFX11-NEXT:    v_pk_max_f16 v0, v3, v2
922; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
923; GFX11-NEXT:    s_endpgm
924    ptr addrspace(1) %r,
925    ptr addrspace(1) %a,
926    ptr addrspace(1) %b) #0 {
927entry:
  ; Load both <4 x half> operands.
928  %a.val = load <4 x half>, ptr addrspace(1) %a
929  %b.val = load <4 x half>, ptr addrspace(1) %b
  ; Operation under test: element-wise maxnum over the four halves.
930  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  ; Write the 64-bit result through %r.
931  store <4 x half> %r.val, ptr addrspace(1) %r
932  ret void
933}
934
; Test: llvm.maxnum on <4 x half> where the first operand is the constant
; vector <8.0, 2.0, 3.0, 4.0> and the second operand is loaded through %b.
; The result is stored through %r.
;
; The assertions inside this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; How the four constants appear in the recorded output, per RUN line:
;   * SI run (tahiti) -- as f32 operands of v_max_f32 after the loaded halves
;     are widened: 0x41000000 (8.0), 2.0, 0x40400000 (3.0) and 4.0.
;   * VI run (fiji) -- as f16 bit patterns: 0x4800 (8.0) and 0x4200 (3.0) as
;     literals of v_max_f16_e32, 0x4000 (2.0) and 0x4400 (4.0) moved into
;     VGPRs for the SDWA v_max_f16 that writes the high half.
;   * GFX9 run -- packed into 0x40004800 ({8.0, 2.0}) and 0x44004200
;     ({3.0, 4.0}), materialized with s_mov_b32 and fed to v_pk_max_f16.
;   * GFX10 / GFX11 runs -- the same two packed values used directly as
;     literal operands of v_pk_max_f16.
;
; NOTE(review): the "max of a register with itself" instructions applied to
; the loaded operand (e.g. v_pk_max_f16 v0, s3, s3) look like input
; canonicalization performed before the real max -- confirm against the
; AMDGPU backend before relying on that reading.
935define amdgpu_kernel void @fmax_v4f16_imm_a(
936; SI-LABEL: fmax_v4f16_imm_a:
937; SI:       ; %bb.0: ; %entry
938; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
939; SI-NEXT:    s_waitcnt lgkmcnt(0)
940; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
941; SI-NEXT:    s_mov_b32 s3, 0xf000
942; SI-NEXT:    s_mov_b32 s2, -1
943; SI-NEXT:    s_waitcnt lgkmcnt(0)
944; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
945; SI-NEXT:    s_lshr_b32 s5, s5, 16
946; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
947; SI-NEXT:    s_lshr_b32 s4, s4, 16
948; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
949; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
950; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
951; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
952; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
953; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
954; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
955; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
956; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
957; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
958; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
959; SI-NEXT:    v_or_b32_e32 v1, v1, v2
960; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
961; SI-NEXT:    v_or_b32_e32 v0, v0, v2
962; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
963; SI-NEXT:    s_endpgm
964;
965; VI-LABEL: fmax_v4f16_imm_a:
966; VI:       ; %bb.0: ; %entry
967; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
968; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
969; VI-NEXT:    s_mov_b32 s7, 0xf000
970; VI-NEXT:    s_mov_b32 s6, -1
971; VI-NEXT:    s_waitcnt lgkmcnt(0)
972; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
973; VI-NEXT:    s_mov_b32 s4, s0
974; VI-NEXT:    s_mov_b32 s5, s1
975; VI-NEXT:    s_waitcnt lgkmcnt(0)
976; VI-NEXT:    s_lshr_b32 s0, s3, 16
977; VI-NEXT:    v_max_f16_e64 v1, s3, s3
978; VI-NEXT:    v_max_f16_e64 v3, s0, s0
979; VI-NEXT:    v_max_f16_e64 v2, s2, s2
980; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
981; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
982; VI-NEXT:    s_lshr_b32 s0, s2, 16
983; VI-NEXT:    v_or_b32_e32 v1, v1, v0
984; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
985; VI-NEXT:    v_max_f16_e64 v2, s0, s0
986; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
987; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
988; VI-NEXT:    v_or_b32_e32 v0, v0, v2
989; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
990; VI-NEXT:    s_endpgm
991;
992; GFX9-LABEL: fmax_v4f16_imm_a:
993; GFX9:       ; %bb.0: ; %entry
994; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
995; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
996; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
997; GFX9-NEXT:    s_mov_b32 s7, 0xf000
998; GFX9-NEXT:    s_mov_b32 s6, -1
999; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1001; GFX9-NEXT:    s_mov_b32 s4, s0
1002; GFX9-NEXT:    s_mov_b32 s5, s1
1003; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX9-NEXT:    v_pk_max_f16 v0, s3, s3
1005; GFX9-NEXT:    v_pk_max_f16 v2, s2, s2
1006; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
1007; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
1008; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1009; GFX9-NEXT:    s_endpgm
1010;
1011; GFX10-LABEL: fmax_v4f16_imm_a:
1012; GFX10:       ; %bb.0: ; %entry
1013; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1014; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1016; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX10-NEXT:    v_pk_max_f16 v0, s3, s3
1018; GFX10-NEXT:    v_pk_max_f16 v2, s2, s2
1019; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1020; GFX10-NEXT:    s_mov_b32 s2, -1
1021; GFX10-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
1022; GFX10-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
1023; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1024; GFX10-NEXT:    s_endpgm
1025;
1026; GFX11-LABEL: fmax_v4f16_imm_a:
1027; GFX11:       ; %bb.0: ; %entry
1028; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1029; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
1031; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1032; GFX11-NEXT:    v_pk_max_f16 v0, s3, s3
1033; GFX11-NEXT:    v_pk_max_f16 v2, s2, s2
1034; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1035; GFX11-NEXT:    s_mov_b32 s2, -1
1036; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1037; GFX11-NEXT:    v_pk_max_f16 v1, 0x44004200, v0
1038; GFX11-NEXT:    v_pk_max_f16 v0, 0x40004800, v2
1039; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1040; GFX11-NEXT:    s_endpgm
1041    ptr addrspace(1) %r,
1042    ptr addrspace(1) %b) #0 {
1043entry:
  ; Only the second maxnum operand comes from memory.
1044  %b.val = load <4 x half>, ptr addrspace(1) %b
  ; The constant vector is passed as the first argument (presumably what the
  ; "_imm_a" suffix in the kernel name refers to).
1045  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
1046  store <4 x half> %r.val, ptr addrspace(1) %r
1047  ret void
1048}
1049
; Attribute group #0, referenced by the kernels in this file through the
; trailing `#0` on their signatures. It sets the f32 denormal mode to
; "preserve-sign" for both outputs and inputs (the two comma-separated
; fields); see the LLVM LangRef entry for "denormal-fp-math-f32".
; NOTE(review): this presumably matters for the SI run, whose recorded output
; performs the max in f32 -- confirm before changing the attribute.
1050attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1051