xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fmax3.ll (revision 8b23ebb498bc67f03571b1d429771b28868b8932)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
7
; Two chained scalar f32 maxnum calls, maxnum(maxnum(%a, %b), %c), fed by three
; volatile loads. Every run line expects the pair to be selected as a single
; three-operand max (v_max3_f32, or v_max3_num_f32 in the gfx1200 checks).
8define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
9; SI-LABEL: test_fmax3_olt_0_f32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s11, 0xf000
13; SI-NEXT:    s_mov_b32 s10, -1
14; SI-NEXT:    s_mov_b32 s14, s10
15; SI-NEXT:    s_mov_b32 s15, s11
16; SI-NEXT:    s_mov_b32 s18, s10
17; SI-NEXT:    s_mov_b32 s19, s11
18; SI-NEXT:    s_mov_b32 s22, s10
19; SI-NEXT:    s_mov_b32 s23, s11
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    s_mov_b32 s12, s2
22; SI-NEXT:    s_mov_b32 s13, s3
23; SI-NEXT:    s_mov_b32 s16, s4
24; SI-NEXT:    s_mov_b32 s17, s5
25; SI-NEXT:    s_mov_b32 s20, s6
26; SI-NEXT:    s_mov_b32 s21, s7
27; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    buffer_load_dword v2, off, s[20:23], 0 glc
32; SI-NEXT:    s_waitcnt vmcnt(0)
33; SI-NEXT:    s_mov_b32 s8, s0
34; SI-NEXT:    s_mov_b32 s9, s1
35; SI-NEXT:    v_max3_f32 v0, v0, v1, v2
36; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
37; SI-NEXT:    s_endpgm
38;
39; VI-LABEL: test_fmax3_olt_0_f32:
40; VI:       ; %bb.0:
41; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
42; VI-NEXT:    s_mov_b32 s11, 0xf000
43; VI-NEXT:    s_mov_b32 s10, -1
44; VI-NEXT:    s_mov_b32 s14, s10
45; VI-NEXT:    s_mov_b32 s15, s11
46; VI-NEXT:    s_waitcnt lgkmcnt(0)
47; VI-NEXT:    s_mov_b32 s12, s2
48; VI-NEXT:    s_mov_b32 s13, s3
49; VI-NEXT:    s_mov_b32 s16, s4
50; VI-NEXT:    s_mov_b32 s17, s5
51; VI-NEXT:    s_mov_b32 s18, s10
52; VI-NEXT:    s_mov_b32 s19, s11
53; VI-NEXT:    s_mov_b32 s4, s6
54; VI-NEXT:    s_mov_b32 s5, s7
55; VI-NEXT:    s_mov_b32 s6, s10
56; VI-NEXT:    s_mov_b32 s7, s11
57; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
60; VI-NEXT:    s_waitcnt vmcnt(0)
61; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
62; VI-NEXT:    s_waitcnt vmcnt(0)
63; VI-NEXT:    s_mov_b32 s8, s0
64; VI-NEXT:    s_mov_b32 s9, s1
65; VI-NEXT:    v_max3_f32 v0, v0, v1, v2
66; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
67; VI-NEXT:    s_endpgm
68;
69; GFX9-LABEL: test_fmax3_olt_0_f32:
70; GFX9:       ; %bb.0:
71; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
72; GFX9-NEXT:    s_mov_b32 s3, 0xf000
73; GFX9-NEXT:    s_mov_b32 s2, -1
74; GFX9-NEXT:    s_mov_b32 s6, s2
75; GFX9-NEXT:    s_mov_b32 s7, s3
76; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX9-NEXT:    s_mov_b32 s4, s10
78; GFX9-NEXT:    s_mov_b32 s5, s11
79; GFX9-NEXT:    s_mov_b32 s16, s12
80; GFX9-NEXT:    s_mov_b32 s17, s13
81; GFX9-NEXT:    s_mov_b32 s18, s2
82; GFX9-NEXT:    s_mov_b32 s19, s3
83; GFX9-NEXT:    s_mov_b32 s12, s14
84; GFX9-NEXT:    s_mov_b32 s13, s15
85; GFX9-NEXT:    s_mov_b32 s14, s2
86; GFX9-NEXT:    s_mov_b32 s15, s3
87; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
88; GFX9-NEXT:    s_waitcnt vmcnt(0)
89; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
90; GFX9-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NEXT:    buffer_load_dword v2, off, s[12:15], 0 glc
92; GFX9-NEXT:    s_waitcnt vmcnt(0)
93; GFX9-NEXT:    s_mov_b32 s0, s8
94; GFX9-NEXT:    s_mov_b32 s1, s9
95; GFX9-NEXT:    v_max3_f32 v0, v0, v1, v2
96; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
97; GFX9-NEXT:    s_endpgm
98;
99; GFX11-LABEL: test_fmax3_olt_0_f32:
100; GFX11:       ; %bb.0:
101; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
102; GFX11-NEXT:    s_mov_b32 s10, -1
103; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
104; GFX11-NEXT:    s_mov_b32 s14, s10
105; GFX11-NEXT:    s_mov_b32 s15, s11
106; GFX11-NEXT:    s_mov_b32 s18, s10
107; GFX11-NEXT:    s_mov_b32 s19, s11
108; GFX11-NEXT:    s_mov_b32 s22, s10
109; GFX11-NEXT:    s_mov_b32 s23, s11
110; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX11-NEXT:    s_mov_b32 s12, s2
112; GFX11-NEXT:    s_mov_b32 s13, s3
113; GFX11-NEXT:    s_mov_b32 s16, s4
114; GFX11-NEXT:    s_mov_b32 s17, s5
115; GFX11-NEXT:    s_mov_b32 s20, s6
116; GFX11-NEXT:    s_mov_b32 s21, s7
117; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0 glc dlc
118; GFX11-NEXT:    s_waitcnt vmcnt(0)
119; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0 glc dlc
120; GFX11-NEXT:    s_waitcnt vmcnt(0)
121; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0 glc dlc
122; GFX11-NEXT:    s_waitcnt vmcnt(0)
123; GFX11-NEXT:    s_mov_b32 s8, s0
124; GFX11-NEXT:    s_mov_b32 s9, s1
125; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
126; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
127; GFX11-NEXT:    s_endpgm
128;
129; GFX12-LABEL: test_fmax3_olt_0_f32:
130; GFX12:       ; %bb.0:
131; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
132; GFX12-NEXT:    s_mov_b32 s10, -1
133; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
134; GFX12-NEXT:    s_mov_b32 s14, s10
135; GFX12-NEXT:    s_mov_b32 s15, s11
136; GFX12-NEXT:    s_mov_b32 s18, s10
137; GFX12-NEXT:    s_mov_b32 s19, s11
138; GFX12-NEXT:    s_mov_b32 s22, s10
139; GFX12-NEXT:    s_mov_b32 s23, s11
140; GFX12-NEXT:    s_wait_kmcnt 0x0
141; GFX12-NEXT:    s_mov_b32 s12, s2
142; GFX12-NEXT:    s_mov_b32 s13, s3
143; GFX12-NEXT:    s_mov_b32 s16, s4
144; GFX12-NEXT:    s_mov_b32 s17, s5
145; GFX12-NEXT:    s_mov_b32 s20, s6
146; GFX12-NEXT:    s_mov_b32 s21, s7
147; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
148; GFX12-NEXT:    s_wait_loadcnt 0x0
149; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
150; GFX12-NEXT:    s_wait_loadcnt 0x0
151; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
152; GFX12-NEXT:    s_wait_loadcnt 0x0
153; GFX12-NEXT:    s_mov_b32 s8, s0
154; GFX12-NEXT:    s_mov_b32 s9, s1
155; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
156; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
157; GFX12-NEXT:    s_endpgm
; The loads are volatile; presumably this keeps them as three separate, ordered
; buffer loads in the expected output - TODO confirm.
158  %a = load volatile  float, ptr addrspace(1) %aptr, align 4
159  %b = load volatile float, ptr addrspace(1) %bptr, align 4
160  %c = load volatile float, ptr addrspace(1) %cptr, align 4
; Inner max result is the FIRST operand of the outer max.
161  %f0 = call float @llvm.maxnum.f32(float %a, float %b)
162  %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
163  store float %f1, ptr addrspace(1) %out, align 4
164  ret void
165}
166
167; Commuted variant of test_fmax3_olt_0_f32: the inner max is the second operand of the outer fmax.
; The checks still expect one three-operand max per run line, with the %c
; register (v2) listed first: v_max3_f32 v0, v2, v0, v1.
168define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
169; SI-LABEL: test_fmax3_olt_1_f32:
170; SI:       ; %bb.0:
171; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
172; SI-NEXT:    s_mov_b32 s11, 0xf000
173; SI-NEXT:    s_mov_b32 s10, -1
174; SI-NEXT:    s_mov_b32 s14, s10
175; SI-NEXT:    s_mov_b32 s15, s11
176; SI-NEXT:    s_mov_b32 s18, s10
177; SI-NEXT:    s_mov_b32 s19, s11
178; SI-NEXT:    s_mov_b32 s22, s10
179; SI-NEXT:    s_mov_b32 s23, s11
180; SI-NEXT:    s_waitcnt lgkmcnt(0)
181; SI-NEXT:    s_mov_b32 s12, s2
182; SI-NEXT:    s_mov_b32 s13, s3
183; SI-NEXT:    s_mov_b32 s16, s4
184; SI-NEXT:    s_mov_b32 s17, s5
185; SI-NEXT:    s_mov_b32 s20, s6
186; SI-NEXT:    s_mov_b32 s21, s7
187; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
188; SI-NEXT:    s_waitcnt vmcnt(0)
189; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
190; SI-NEXT:    s_waitcnt vmcnt(0)
191; SI-NEXT:    buffer_load_dword v2, off, s[20:23], 0 glc
192; SI-NEXT:    s_waitcnt vmcnt(0)
193; SI-NEXT:    s_mov_b32 s8, s0
194; SI-NEXT:    s_mov_b32 s9, s1
195; SI-NEXT:    v_max3_f32 v0, v2, v0, v1
196; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
197; SI-NEXT:    s_endpgm
198;
199; VI-LABEL: test_fmax3_olt_1_f32:
200; VI:       ; %bb.0:
201; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
202; VI-NEXT:    s_mov_b32 s11, 0xf000
203; VI-NEXT:    s_mov_b32 s10, -1
204; VI-NEXT:    s_mov_b32 s14, s10
205; VI-NEXT:    s_mov_b32 s15, s11
206; VI-NEXT:    s_waitcnt lgkmcnt(0)
207; VI-NEXT:    s_mov_b32 s12, s2
208; VI-NEXT:    s_mov_b32 s13, s3
209; VI-NEXT:    s_mov_b32 s16, s4
210; VI-NEXT:    s_mov_b32 s17, s5
211; VI-NEXT:    s_mov_b32 s18, s10
212; VI-NEXT:    s_mov_b32 s19, s11
213; VI-NEXT:    s_mov_b32 s4, s6
214; VI-NEXT:    s_mov_b32 s5, s7
215; VI-NEXT:    s_mov_b32 s6, s10
216; VI-NEXT:    s_mov_b32 s7, s11
217; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
218; VI-NEXT:    s_waitcnt vmcnt(0)
219; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
220; VI-NEXT:    s_waitcnt vmcnt(0)
221; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
222; VI-NEXT:    s_waitcnt vmcnt(0)
223; VI-NEXT:    s_mov_b32 s8, s0
224; VI-NEXT:    s_mov_b32 s9, s1
225; VI-NEXT:    v_max3_f32 v0, v2, v0, v1
226; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
227; VI-NEXT:    s_endpgm
228;
229; GFX9-LABEL: test_fmax3_olt_1_f32:
230; GFX9:       ; %bb.0:
231; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
232; GFX9-NEXT:    s_mov_b32 s3, 0xf000
233; GFX9-NEXT:    s_mov_b32 s2, -1
234; GFX9-NEXT:    s_mov_b32 s6, s2
235; GFX9-NEXT:    s_mov_b32 s7, s3
236; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
237; GFX9-NEXT:    s_mov_b32 s4, s10
238; GFX9-NEXT:    s_mov_b32 s5, s11
239; GFX9-NEXT:    s_mov_b32 s16, s12
240; GFX9-NEXT:    s_mov_b32 s17, s13
241; GFX9-NEXT:    s_mov_b32 s18, s2
242; GFX9-NEXT:    s_mov_b32 s19, s3
243; GFX9-NEXT:    s_mov_b32 s12, s14
244; GFX9-NEXT:    s_mov_b32 s13, s15
245; GFX9-NEXT:    s_mov_b32 s14, s2
246; GFX9-NEXT:    s_mov_b32 s15, s3
247; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
248; GFX9-NEXT:    s_waitcnt vmcnt(0)
249; GFX9-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
250; GFX9-NEXT:    s_waitcnt vmcnt(0)
251; GFX9-NEXT:    buffer_load_dword v2, off, s[12:15], 0 glc
252; GFX9-NEXT:    s_waitcnt vmcnt(0)
253; GFX9-NEXT:    s_mov_b32 s0, s8
254; GFX9-NEXT:    s_mov_b32 s1, s9
255; GFX9-NEXT:    v_max3_f32 v0, v2, v0, v1
256; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
257; GFX9-NEXT:    s_endpgm
258;
259; GFX11-LABEL: test_fmax3_olt_1_f32:
260; GFX11:       ; %bb.0:
261; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
262; GFX11-NEXT:    s_mov_b32 s10, -1
263; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
264; GFX11-NEXT:    s_mov_b32 s14, s10
265; GFX11-NEXT:    s_mov_b32 s15, s11
266; GFX11-NEXT:    s_mov_b32 s18, s10
267; GFX11-NEXT:    s_mov_b32 s19, s11
268; GFX11-NEXT:    s_mov_b32 s22, s10
269; GFX11-NEXT:    s_mov_b32 s23, s11
270; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
271; GFX11-NEXT:    s_mov_b32 s12, s2
272; GFX11-NEXT:    s_mov_b32 s13, s3
273; GFX11-NEXT:    s_mov_b32 s16, s4
274; GFX11-NEXT:    s_mov_b32 s17, s5
275; GFX11-NEXT:    s_mov_b32 s20, s6
276; GFX11-NEXT:    s_mov_b32 s21, s7
277; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0 glc dlc
278; GFX11-NEXT:    s_waitcnt vmcnt(0)
279; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0 glc dlc
280; GFX11-NEXT:    s_waitcnt vmcnt(0)
281; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0 glc dlc
282; GFX11-NEXT:    s_waitcnt vmcnt(0)
283; GFX11-NEXT:    s_mov_b32 s8, s0
284; GFX11-NEXT:    s_mov_b32 s9, s1
285; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
286; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
287; GFX11-NEXT:    s_endpgm
288;
289; GFX12-LABEL: test_fmax3_olt_1_f32:
290; GFX12:       ; %bb.0:
291; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
292; GFX12-NEXT:    s_mov_b32 s10, -1
293; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
294; GFX12-NEXT:    s_mov_b32 s14, s10
295; GFX12-NEXT:    s_mov_b32 s15, s11
296; GFX12-NEXT:    s_mov_b32 s18, s10
297; GFX12-NEXT:    s_mov_b32 s19, s11
298; GFX12-NEXT:    s_mov_b32 s22, s10
299; GFX12-NEXT:    s_mov_b32 s23, s11
300; GFX12-NEXT:    s_wait_kmcnt 0x0
301; GFX12-NEXT:    s_mov_b32 s12, s2
302; GFX12-NEXT:    s_mov_b32 s13, s3
303; GFX12-NEXT:    s_mov_b32 s16, s4
304; GFX12-NEXT:    s_mov_b32 s17, s5
305; GFX12-NEXT:    s_mov_b32 s20, s6
306; GFX12-NEXT:    s_mov_b32 s21, s7
307; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
308; GFX12-NEXT:    s_wait_loadcnt 0x0
309; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
310; GFX12-NEXT:    s_wait_loadcnt 0x0
311; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
312; GFX12-NEXT:    s_wait_loadcnt 0x0
313; GFX12-NEXT:    s_mov_b32 s8, s0
314; GFX12-NEXT:    s_mov_b32 s9, s1
315; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
316; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
317; GFX12-NEXT:    s_endpgm
318  %a = load volatile float, ptr addrspace(1) %aptr, align 4
319  %b = load volatile float, ptr addrspace(1) %bptr, align 4
320  %c = load volatile float, ptr addrspace(1) %cptr, align 4
321  %f0 = call float @llvm.maxnum.f32(float %a, float %b)
; Operands swapped relative to test_fmax3_olt_0_f32: %c first, inner max second.
322  %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
323  store float %f1, ptr addrspace(1) %out, align 4
324  ret void
325}
326
; Half-precision version of test_fmax3_olt_0_f32: maxnum(maxnum(%a, %b), %c) on
; three volatile half loads. What the checks expect differs per run line:
;  - default amdgcn target: convert to f32, one v_max3_f32, convert back to f16;
;  - tonga: no three-operand form, only two-operand v_max_f16 instructions;
;  - gfx900 / gfx1100: a single v_max3_f16;
;  - gfx1200: a single v_max3_num_f16.
327define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
328; SI-LABEL: test_fmax3_olt_0_f16:
329; SI:       ; %bb.0:
330; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
331; SI-NEXT:    s_mov_b32 s11, 0xf000
332; SI-NEXT:    s_mov_b32 s10, -1
333; SI-NEXT:    s_mov_b32 s14, s10
334; SI-NEXT:    s_mov_b32 s15, s11
335; SI-NEXT:    s_mov_b32 s18, s10
336; SI-NEXT:    s_mov_b32 s19, s11
337; SI-NEXT:    s_mov_b32 s22, s10
338; SI-NEXT:    s_mov_b32 s23, s11
339; SI-NEXT:    s_waitcnt lgkmcnt(0)
340; SI-NEXT:    s_mov_b32 s12, s2
341; SI-NEXT:    s_mov_b32 s13, s3
342; SI-NEXT:    s_mov_b32 s16, s4
343; SI-NEXT:    s_mov_b32 s17, s5
344; SI-NEXT:    s_mov_b32 s20, s6
345; SI-NEXT:    s_mov_b32 s21, s7
346; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
347; SI-NEXT:    s_waitcnt vmcnt(0)
348; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    buffer_load_ushort v2, off, s[20:23], 0 glc
351; SI-NEXT:    s_waitcnt vmcnt(0)
352; SI-NEXT:    s_mov_b32 s8, s0
353; SI-NEXT:    s_mov_b32 s9, s1
354; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
355; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
356; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
357; SI-NEXT:    v_max3_f32 v0, v0, v1, v2
358; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
359; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
360; SI-NEXT:    s_endpgm
361;
362; VI-LABEL: test_fmax3_olt_0_f16:
363; VI:       ; %bb.0:
364; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
365; VI-NEXT:    s_mov_b32 s11, 0xf000
366; VI-NEXT:    s_mov_b32 s10, -1
367; VI-NEXT:    s_mov_b32 s14, s10
368; VI-NEXT:    s_mov_b32 s15, s11
369; VI-NEXT:    s_waitcnt lgkmcnt(0)
370; VI-NEXT:    s_mov_b32 s12, s2
371; VI-NEXT:    s_mov_b32 s13, s3
372; VI-NEXT:    s_mov_b32 s16, s4
373; VI-NEXT:    s_mov_b32 s17, s5
374; VI-NEXT:    s_mov_b32 s18, s10
375; VI-NEXT:    s_mov_b32 s19, s11
376; VI-NEXT:    s_mov_b32 s4, s6
377; VI-NEXT:    s_mov_b32 s5, s7
378; VI-NEXT:    s_mov_b32 s6, s10
379; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
380; VI-NEXT:    s_waitcnt vmcnt(0)
381; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
382; VI-NEXT:    s_waitcnt vmcnt(0)
383; VI-NEXT:    s_mov_b32 s7, s11
384; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
385; VI-NEXT:    s_waitcnt vmcnt(0)
386; VI-NEXT:    s_mov_b32 s8, s0
387; VI-NEXT:    s_mov_b32 s9, s1
388; VI-NEXT:    v_max_f16_e32 v0, v0, v0
389; VI-NEXT:    v_max_f16_e32 v1, v1, v1
390; VI-NEXT:    v_max_f16_e32 v0, v0, v1
391; VI-NEXT:    v_max_f16_e32 v1, v2, v2
392; VI-NEXT:    v_max_f16_e32 v0, v0, v1
393; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
394; VI-NEXT:    s_endpgm
395;
396; GFX9-LABEL: test_fmax3_olt_0_f16:
397; GFX9:       ; %bb.0:
398; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
399; GFX9-NEXT:    s_mov_b32 s3, 0xf000
400; GFX9-NEXT:    s_mov_b32 s2, -1
401; GFX9-NEXT:    s_mov_b32 s6, s2
402; GFX9-NEXT:    s_mov_b32 s7, s3
403; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX9-NEXT:    s_mov_b32 s4, s10
405; GFX9-NEXT:    s_mov_b32 s5, s11
406; GFX9-NEXT:    s_mov_b32 s16, s12
407; GFX9-NEXT:    s_mov_b32 s17, s13
408; GFX9-NEXT:    s_mov_b32 s18, s2
409; GFX9-NEXT:    s_mov_b32 s19, s3
410; GFX9-NEXT:    s_mov_b32 s12, s14
411; GFX9-NEXT:    s_mov_b32 s13, s15
412; GFX9-NEXT:    s_mov_b32 s14, s2
413; GFX9-NEXT:    s_mov_b32 s15, s3
414; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
415; GFX9-NEXT:    s_waitcnt vmcnt(0)
416; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
417; GFX9-NEXT:    s_waitcnt vmcnt(0)
418; GFX9-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
419; GFX9-NEXT:    s_waitcnt vmcnt(0)
420; GFX9-NEXT:    s_mov_b32 s0, s8
421; GFX9-NEXT:    s_mov_b32 s1, s9
422; GFX9-NEXT:    v_max3_f16 v0, v0, v1, v2
423; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
424; GFX9-NEXT:    s_endpgm
425;
426; GFX11-LABEL: test_fmax3_olt_0_f16:
427; GFX11:       ; %bb.0:
428; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
429; GFX11-NEXT:    s_mov_b32 s10, -1
430; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
431; GFX11-NEXT:    s_mov_b32 s14, s10
432; GFX11-NEXT:    s_mov_b32 s15, s11
433; GFX11-NEXT:    s_mov_b32 s18, s10
434; GFX11-NEXT:    s_mov_b32 s19, s11
435; GFX11-NEXT:    s_mov_b32 s22, s10
436; GFX11-NEXT:    s_mov_b32 s23, s11
437; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX11-NEXT:    s_mov_b32 s12, s2
439; GFX11-NEXT:    s_mov_b32 s13, s3
440; GFX11-NEXT:    s_mov_b32 s16, s4
441; GFX11-NEXT:    s_mov_b32 s17, s5
442; GFX11-NEXT:    s_mov_b32 s20, s6
443; GFX11-NEXT:    s_mov_b32 s21, s7
444; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
445; GFX11-NEXT:    s_waitcnt vmcnt(0)
446; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
447; GFX11-NEXT:    s_waitcnt vmcnt(0)
448; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
449; GFX11-NEXT:    s_waitcnt vmcnt(0)
450; GFX11-NEXT:    s_mov_b32 s8, s0
451; GFX11-NEXT:    s_mov_b32 s9, s1
452; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
453; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
454; GFX11-NEXT:    s_endpgm
455;
456; GFX12-LABEL: test_fmax3_olt_0_f16:
457; GFX12:       ; %bb.0:
458; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
459; GFX12-NEXT:    s_mov_b32 s10, -1
460; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
461; GFX12-NEXT:    s_mov_b32 s14, s10
462; GFX12-NEXT:    s_mov_b32 s15, s11
463; GFX12-NEXT:    s_mov_b32 s18, s10
464; GFX12-NEXT:    s_mov_b32 s19, s11
465; GFX12-NEXT:    s_mov_b32 s22, s10
466; GFX12-NEXT:    s_mov_b32 s23, s11
467; GFX12-NEXT:    s_wait_kmcnt 0x0
468; GFX12-NEXT:    s_mov_b32 s12, s2
469; GFX12-NEXT:    s_mov_b32 s13, s3
470; GFX12-NEXT:    s_mov_b32 s16, s4
471; GFX12-NEXT:    s_mov_b32 s17, s5
472; GFX12-NEXT:    s_mov_b32 s20, s6
473; GFX12-NEXT:    s_mov_b32 s21, s7
474; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
475; GFX12-NEXT:    s_wait_loadcnt 0x0
476; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
477; GFX12-NEXT:    s_wait_loadcnt 0x0
478; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
479; GFX12-NEXT:    s_wait_loadcnt 0x0
480; GFX12-NEXT:    s_mov_b32 s8, s0
481; GFX12-NEXT:    s_mov_b32 s9, s1
482; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
483; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
484; GFX12-NEXT:    s_endpgm
485  %a = load volatile half, ptr addrspace(1) %aptr, align 2
486  %b = load volatile half, ptr addrspace(1) %bptr, align 2
487  %c = load volatile half, ptr addrspace(1) %cptr, align 2
; Inner max result is the FIRST operand of the outer max.
488  %f0 = call half @llvm.maxnum.f16(half %a, half %b)
489  %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
490  store half %f1, ptr addrspace(1) %out, align 2
491  ret void
492}
493
494; Commute operand of second fmax
; Half-precision counterpart of test_fmax3_olt_1_f32: maxnum(%c, maxnum(%a, %b)).
; Expected selection per run line matches test_fmax3_olt_0_f16, except that the
; %c register (v2) comes first in the three-operand forms, and the last
; two-operand max in the tonga checks has its sources swapped (v1, v0).
495define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
496; SI-LABEL: test_fmax3_olt_1_f16:
497; SI:       ; %bb.0:
498; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
499; SI-NEXT:    s_mov_b32 s11, 0xf000
500; SI-NEXT:    s_mov_b32 s10, -1
501; SI-NEXT:    s_mov_b32 s14, s10
502; SI-NEXT:    s_mov_b32 s15, s11
503; SI-NEXT:    s_mov_b32 s18, s10
504; SI-NEXT:    s_mov_b32 s19, s11
505; SI-NEXT:    s_mov_b32 s22, s10
506; SI-NEXT:    s_mov_b32 s23, s11
507; SI-NEXT:    s_waitcnt lgkmcnt(0)
508; SI-NEXT:    s_mov_b32 s12, s2
509; SI-NEXT:    s_mov_b32 s13, s3
510; SI-NEXT:    s_mov_b32 s16, s4
511; SI-NEXT:    s_mov_b32 s17, s5
512; SI-NEXT:    s_mov_b32 s20, s6
513; SI-NEXT:    s_mov_b32 s21, s7
514; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
515; SI-NEXT:    s_waitcnt vmcnt(0)
516; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
517; SI-NEXT:    s_waitcnt vmcnt(0)
518; SI-NEXT:    buffer_load_ushort v2, off, s[20:23], 0 glc
519; SI-NEXT:    s_waitcnt vmcnt(0)
520; SI-NEXT:    s_mov_b32 s8, s0
521; SI-NEXT:    s_mov_b32 s9, s1
522; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
523; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
524; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
525; SI-NEXT:    v_max3_f32 v0, v2, v0, v1
526; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
527; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
528; SI-NEXT:    s_endpgm
529;
530; VI-LABEL: test_fmax3_olt_1_f16:
531; VI:       ; %bb.0:
532; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
533; VI-NEXT:    s_mov_b32 s11, 0xf000
534; VI-NEXT:    s_mov_b32 s10, -1
535; VI-NEXT:    s_mov_b32 s14, s10
536; VI-NEXT:    s_mov_b32 s15, s11
537; VI-NEXT:    s_waitcnt lgkmcnt(0)
538; VI-NEXT:    s_mov_b32 s12, s2
539; VI-NEXT:    s_mov_b32 s13, s3
540; VI-NEXT:    s_mov_b32 s16, s4
541; VI-NEXT:    s_mov_b32 s17, s5
542; VI-NEXT:    s_mov_b32 s18, s10
543; VI-NEXT:    s_mov_b32 s19, s11
544; VI-NEXT:    s_mov_b32 s4, s6
545; VI-NEXT:    s_mov_b32 s5, s7
546; VI-NEXT:    s_mov_b32 s6, s10
547; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
548; VI-NEXT:    s_waitcnt vmcnt(0)
549; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
550; VI-NEXT:    s_waitcnt vmcnt(0)
551; VI-NEXT:    s_mov_b32 s7, s11
552; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
553; VI-NEXT:    s_waitcnt vmcnt(0)
554; VI-NEXT:    s_mov_b32 s8, s0
555; VI-NEXT:    s_mov_b32 s9, s1
556; VI-NEXT:    v_max_f16_e32 v0, v0, v0
557; VI-NEXT:    v_max_f16_e32 v1, v1, v1
558; VI-NEXT:    v_max_f16_e32 v0, v0, v1
559; VI-NEXT:    v_max_f16_e32 v1, v2, v2
560; VI-NEXT:    v_max_f16_e32 v0, v1, v0
561; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
562; VI-NEXT:    s_endpgm
563;
564; GFX9-LABEL: test_fmax3_olt_1_f16:
565; GFX9:       ; %bb.0:
566; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
567; GFX9-NEXT:    s_mov_b32 s3, 0xf000
568; GFX9-NEXT:    s_mov_b32 s2, -1
569; GFX9-NEXT:    s_mov_b32 s6, s2
570; GFX9-NEXT:    s_mov_b32 s7, s3
571; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX9-NEXT:    s_mov_b32 s4, s10
573; GFX9-NEXT:    s_mov_b32 s5, s11
574; GFX9-NEXT:    s_mov_b32 s16, s12
575; GFX9-NEXT:    s_mov_b32 s17, s13
576; GFX9-NEXT:    s_mov_b32 s18, s2
577; GFX9-NEXT:    s_mov_b32 s19, s3
578; GFX9-NEXT:    s_mov_b32 s12, s14
579; GFX9-NEXT:    s_mov_b32 s13, s15
580; GFX9-NEXT:    s_mov_b32 s14, s2
581; GFX9-NEXT:    s_mov_b32 s15, s3
582; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
583; GFX9-NEXT:    s_waitcnt vmcnt(0)
584; GFX9-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
585; GFX9-NEXT:    s_waitcnt vmcnt(0)
586; GFX9-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
587; GFX9-NEXT:    s_waitcnt vmcnt(0)
588; GFX9-NEXT:    s_mov_b32 s0, s8
589; GFX9-NEXT:    s_mov_b32 s1, s9
590; GFX9-NEXT:    v_max3_f16 v0, v2, v0, v1
591; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
592; GFX9-NEXT:    s_endpgm
593;
594; GFX11-LABEL: test_fmax3_olt_1_f16:
595; GFX11:       ; %bb.0:
596; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
597; GFX11-NEXT:    s_mov_b32 s10, -1
598; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
599; GFX11-NEXT:    s_mov_b32 s14, s10
600; GFX11-NEXT:    s_mov_b32 s15, s11
601; GFX11-NEXT:    s_mov_b32 s18, s10
602; GFX11-NEXT:    s_mov_b32 s19, s11
603; GFX11-NEXT:    s_mov_b32 s22, s10
604; GFX11-NEXT:    s_mov_b32 s23, s11
605; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
606; GFX11-NEXT:    s_mov_b32 s12, s2
607; GFX11-NEXT:    s_mov_b32 s13, s3
608; GFX11-NEXT:    s_mov_b32 s16, s4
609; GFX11-NEXT:    s_mov_b32 s17, s5
610; GFX11-NEXT:    s_mov_b32 s20, s6
611; GFX11-NEXT:    s_mov_b32 s21, s7
612; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
613; GFX11-NEXT:    s_waitcnt vmcnt(0)
614; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
615; GFX11-NEXT:    s_waitcnt vmcnt(0)
616; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
617; GFX11-NEXT:    s_waitcnt vmcnt(0)
618; GFX11-NEXT:    s_mov_b32 s8, s0
619; GFX11-NEXT:    s_mov_b32 s9, s1
620; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
621; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
622; GFX11-NEXT:    s_endpgm
623;
624; GFX12-LABEL: test_fmax3_olt_1_f16:
625; GFX12:       ; %bb.0:
626; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
627; GFX12-NEXT:    s_mov_b32 s10, -1
628; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
629; GFX12-NEXT:    s_mov_b32 s14, s10
630; GFX12-NEXT:    s_mov_b32 s15, s11
631; GFX12-NEXT:    s_mov_b32 s18, s10
632; GFX12-NEXT:    s_mov_b32 s19, s11
633; GFX12-NEXT:    s_mov_b32 s22, s10
634; GFX12-NEXT:    s_mov_b32 s23, s11
635; GFX12-NEXT:    s_wait_kmcnt 0x0
636; GFX12-NEXT:    s_mov_b32 s12, s2
637; GFX12-NEXT:    s_mov_b32 s13, s3
638; GFX12-NEXT:    s_mov_b32 s16, s4
639; GFX12-NEXT:    s_mov_b32 s17, s5
640; GFX12-NEXT:    s_mov_b32 s20, s6
641; GFX12-NEXT:    s_mov_b32 s21, s7
642; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
643; GFX12-NEXT:    s_wait_loadcnt 0x0
644; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
645; GFX12-NEXT:    s_wait_loadcnt 0x0
646; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
647; GFX12-NEXT:    s_wait_loadcnt 0x0
648; GFX12-NEXT:    s_mov_b32 s8, s0
649; GFX12-NEXT:    s_mov_b32 s9, s1
650; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
651; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
652; GFX12-NEXT:    s_endpgm
653  %a = load volatile half, ptr addrspace(1) %aptr, align 2
654  %b = load volatile half, ptr addrspace(1) %bptr, align 2
655  %c = load volatile half, ptr addrspace(1) %cptr, align 2
656  %f0 = call half @llvm.maxnum.f16(half %a, half %b)
; Operands swapped relative to test_fmax3_olt_0_f16: %c first, inner max second.
657  %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
658  store half %f1, ptr addrspace(1) %out, align 2
659  ret void
660}
661
662; Negative test: performMinMaxCombine() should not fold a chain of vector (<2 x half>) maxes into max3,
663; since there are no packed instructions for fmax3.
; The gfx900, gfx1100 and gfx1200 checks expect three packed maxes (v_pk_max_f16
; or v_pk_max_num_f16) and no three-operand max. The tonga checks expect only
; two-operand v_max_f16 forms. The default-target checks do contain v_max3_f32,
; applied to the individual f32-converted elements rather than to a vector.
664define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
665; SI-LABEL: no_fmax3_v2f16:
666; SI:       ; %bb.0: ; %entry
667; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
668; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
669; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
670; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
671; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
672; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
673; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
674; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
675; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
676; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
677; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
678; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
679; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
680; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
681; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
682; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
683; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
684; SI-NEXT:    v_max_f32_e32 v1, v1, v3
685; SI-NEXT:    v_max_f32_e32 v0, v0, v2
686; SI-NEXT:    v_max3_f32 v0, v4, v0, v6
687; SI-NEXT:    v_max3_f32 v1, v5, v1, v7
688; SI-NEXT:    s_setpc_b64 s[30:31]
689;
690; VI-LABEL: no_fmax3_v2f16:
691; VI:       ; %bb.0: ; %entry
692; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
693; VI-NEXT:    v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
694; VI-NEXT:    v_max_f16_e32 v0, v0, v1
695; VI-NEXT:    v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
696; VI-NEXT:    v_max_f16_e32 v0, v2, v0
697; VI-NEXT:    v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
698; VI-NEXT:    v_max_f16_e32 v0, v0, v3
699; VI-NEXT:    v_or_b32_e32 v0, v0, v1
700; VI-NEXT:    s_setpc_b64 s[30:31]
701;
702; GFX9-LABEL: no_fmax3_v2f16:
703; GFX9:       ; %bb.0: ; %entry
704; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
706; GFX9-NEXT:    v_pk_max_f16 v0, v2, v0
707; GFX9-NEXT:    v_pk_max_f16 v0, v0, v3
708; GFX9-NEXT:    s_setpc_b64 s[30:31]
709;
710; GFX11-LABEL: no_fmax3_v2f16:
711; GFX11:       ; %bb.0: ; %entry
712; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
713; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
715; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
716; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
717; GFX11-NEXT:    s_setpc_b64 s[30:31]
718;
719; GFX12-LABEL: no_fmax3_v2f16:
720; GFX12:       ; %bb.0: ; %entry
721; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
722; GFX12-NEXT:    s_wait_expcnt 0x0
723; GFX12-NEXT:    s_wait_samplecnt 0x0
724; GFX12-NEXT:    s_wait_bvhcnt 0x0
725; GFX12-NEXT:    s_wait_kmcnt 0x0
726; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
727; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
728; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
729; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
730; GFX12-NEXT:    s_setpc_b64 s[30:31]
731entry:
; Chain of three vector maxes: max(max(%c, max(%a, %b)), %d).
732  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
733  %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
734  %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
735  ret <2 x half> %res
736}
737
; Intrinsic declarations. llvm.amdgcn.workitem.id.x is not called by any
; function visible in this file - TODO confirm whether it is still needed.
738declare i32 @llvm.amdgcn.workitem.id.x() #1
739declare float @llvm.maxnum.f32(float, float) #1
740declare half @llvm.maxnum.f16(half, half) #1
741declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
742
; Attribute groups: #0 is attached to the four amdgpu_kernel tests, #1 to the
; first three intrinsic declarations above, and #2 to no_fmax3_v2f16 only.
743attributes #0 = { nounwind }
744attributes #1 = { nounwind readnone speculatable }
745attributes #2 = { nounwind "no-nans-fp-math"="true" }
746