xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (revision ec66c4af09263e68d800971906e60afc27d54a06)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
4; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
7
; Declarations of the intrinsics referenced by the tests in this file:
; fabs for half and <2 x half>, canonicalize for half and for half vectors
; from <2 x half> up to <64 x half>, and the workitem-id-x query.
; Attribute group #0 is defined outside this excerpt.
8declare half @llvm.fabs.f16(half) #0
9declare half @llvm.canonicalize.f16(half) #0
10declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
11declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
12declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
13declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
14declare <6 x half> @llvm.canonicalize.v6f16(<6 x half>) #0
15declare <8 x half> @llvm.canonicalize.v8f16(<8 x half>) #0
16declare <12 x half> @llvm.canonicalize.v12f16(<12 x half>) #0
17declare <16 x half> @llvm.canonicalize.v16f16(<16 x half>) #0
18declare <32 x half> @llvm.canonicalize.v32f16(<32 x half>) #0
19declare <64 x half> @llvm.canonicalize.v64f16(<64 x half>) #0
20declare i32 @llvm.amdgcn.workitem.id.x() #0
21
; Canonicalizes an undef half and stores the result to %out.
; The expected code for every run line contains no canonicalize arithmetic:
; a constant 0 is materialized in a VGPR and written with a 16-bit store,
; i.e. the call is folded away at compile time.
22define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
23; VI-LABEL: test_fold_canonicalize_undef_value_f16:
24; VI:       ; %bb.0:
25; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
26; VI-NEXT:    v_mov_b32_e32 v2, 0
27; VI-NEXT:    s_waitcnt lgkmcnt(0)
28; VI-NEXT:    v_mov_b32_e32 v0, s0
29; VI-NEXT:    v_mov_b32_e32 v1, s1
30; VI-NEXT:    flat_store_short v[0:1], v2
31; VI-NEXT:    s_endpgm
32;
33; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
34; GFX9:       ; %bb.0:
35; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
36; GFX9-NEXT:    v_mov_b32_e32 v0, 0
37; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
38; GFX9-NEXT:    global_store_short v0, v0, s[0:1]
39; GFX9-NEXT:    s_endpgm
40;
41; CI-LABEL: test_fold_canonicalize_undef_value_f16:
42; CI:       ; %bb.0:
43; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
44; CI-NEXT:    s_mov_b32 s3, 0xf000
45; CI-NEXT:    s_mov_b32 s2, -1
46; CI-NEXT:    v_mov_b32_e32 v0, 0
47; CI-NEXT:    s_waitcnt lgkmcnt(0)
48; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
49; CI-NEXT:    s_endpgm
50;
51; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
52; GFX11:       ; %bb.0:
53; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
54; GFX11-NEXT:    v_mov_b32_e32 v0, 0
55; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX11-NEXT:    global_store_b16 v0, v0, s[0:1]
57; GFX11-NEXT:    s_endpgm
58  %canonicalized = call half @llvm.canonicalize.f16(half undef)
59  store half %canonicalized, ptr addrspace(1) %out
60  ret void
61}
62
; Canonicalizes a half loaded from memory (%out), so the operand lives in a
; VGPR. Expected lowering: the VI, GFX9 and GFX11 runs emit v_max_f16 of the
; value with itself (on 16-bit register halves for the true16 run); the CI run
; emits a v_cvt_f32_f16 / v_cvt_f16_f32 round trip instead.
; Note that the result is stored through an undef pointer, so the address
; operand of the final store in the expected output carries no meaning.
63define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
64; VI-LABEL: v_test_canonicalize_var_f16:
65; VI:       ; %bb.0:
66; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
67; VI-NEXT:    s_waitcnt lgkmcnt(0)
68; VI-NEXT:    v_mov_b32_e32 v0, s0
69; VI-NEXT:    v_mov_b32_e32 v1, s1
70; VI-NEXT:    flat_load_ushort v0, v[0:1]
71; VI-NEXT:    s_waitcnt vmcnt(0)
72; VI-NEXT:    v_max_f16_e32 v0, v0, v0
73; VI-NEXT:    flat_store_short v[0:1], v0
74; VI-NEXT:    s_endpgm
75;
76; GFX9-LABEL: v_test_canonicalize_var_f16:
77; GFX9:       ; %bb.0:
78; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
79; GFX9-NEXT:    v_mov_b32_e32 v0, 0
80; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX9-NEXT:    global_load_ushort v0, v0, s[0:1]
82; GFX9-NEXT:    s_waitcnt vmcnt(0)
83; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
84; GFX9-NEXT:    global_store_short v[0:1], v0, off
85; GFX9-NEXT:    s_endpgm
86;
87; CI-LABEL: v_test_canonicalize_var_f16:
88; CI:       ; %bb.0:
89; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
90; CI-NEXT:    s_mov_b32 s3, 0xf000
91; CI-NEXT:    s_mov_b32 s2, -1
92; CI-NEXT:    s_waitcnt lgkmcnt(0)
93; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
94; CI-NEXT:    s_waitcnt vmcnt(0)
95; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
96; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
97; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
98; CI-NEXT:    s_endpgm
99;
100; GFX11-TRUE16-LABEL: v_test_canonicalize_var_f16:
101; GFX11-TRUE16:       ; %bb.0:
102; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
103; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
104; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX11-TRUE16-NEXT:    global_load_u16 v0, v0, s[0:1]
106; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
107; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
108; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
109; GFX11-TRUE16-NEXT:    s_endpgm
110;
111; GFX11-FAKE16-LABEL: v_test_canonicalize_var_f16:
112; GFX11-FAKE16:       ; %bb.0:
113; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
114; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
115; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[0:1]
117; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
118; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
119; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
120; GFX11-FAKE16-NEXT:    s_endpgm
121  %val = load half, ptr addrspace(1) %out
122  %canonicalized = call half @llvm.canonicalize.f16(half %val)
123  store half %canonicalized, ptr addrspace(1) undef
124  ret void
125}
126
; Canonicalizes a half that arrives as a kernel argument (an i16 bitcast to
; half), so the operand starts out in an SGPR. Expected lowering: the VI, GFX9
; and GFX11 fake16 runs feed the SGPR directly to v_max_f16_e64; the GFX11
; true16 run first copies it into v0.l with v_mov_b16 and then uses
; v_max_f16_e32; the CI run converts the SGPR to f32 and back to f16.
127define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
128; VI-LABEL: s_test_canonicalize_var_f16:
129; VI:       ; %bb.0:
130; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
131; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
132; VI-NEXT:    s_waitcnt lgkmcnt(0)
133; VI-NEXT:    v_max_f16_e64 v2, s2, s2
134; VI-NEXT:    v_mov_b32_e32 v0, s0
135; VI-NEXT:    v_mov_b32_e32 v1, s1
136; VI-NEXT:    flat_store_short v[0:1], v2
137; VI-NEXT:    s_endpgm
138;
139; GFX9-LABEL: s_test_canonicalize_var_f16:
140; GFX9:       ; %bb.0:
141; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
142; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
143; GFX9-NEXT:    v_mov_b32_e32 v0, 0
144; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX9-NEXT:    v_max_f16_e64 v1, s2, s2
146; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
147; GFX9-NEXT:    s_endpgm
148;
149; CI-LABEL: s_test_canonicalize_var_f16:
150; CI:       ; %bb.0:
151; CI-NEXT:    s_load_dword s0, s[4:5], 0xb
152; CI-NEXT:    s_mov_b32 s3, 0xf000
153; CI-NEXT:    s_mov_b32 s2, -1
154; CI-NEXT:    s_waitcnt lgkmcnt(0)
155; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
156; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
157; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
158; CI-NEXT:    s_waitcnt lgkmcnt(0)
159; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
160; CI-NEXT:    s_endpgm
161;
162; GFX11-TRUE16-LABEL: s_test_canonicalize_var_f16:
163; GFX11-TRUE16:       ; %bb.0:
164; GFX11-TRUE16-NEXT:    s_clause 0x1
165; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
166; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
167; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
168; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
169; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
170; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
171; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
172; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
173; GFX11-TRUE16-NEXT:    s_endpgm
174;
175; GFX11-FAKE16-LABEL: s_test_canonicalize_var_f16:
176; GFX11-FAKE16:       ; %bb.0:
177; GFX11-FAKE16-NEXT:    s_clause 0x1
178; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
179; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
180; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
181; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s2, s2
183; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
184; GFX11-FAKE16-NEXT:    s_endpgm
185  %val = bitcast i16 %val.arg to half
186  %canonicalized = call half @llvm.canonicalize.f16(half %val)
187  store half %canonicalized, ptr addrspace(1) %out
188  ret void
189}
190
; Non-kernel function: builds a <2 x half> from two scalar half arguments
; (%lo in element 0, %hi in element 1) and canonicalizes the vector.
; Expected lowering: the VI run canonicalizes each half separately (an SDWA
; v_max_f16 writing WORD_1 for the high element, a plain v_max_f16 for the low
; one) and ORs the results; the GFX9 and GFX11 runs pack the two inputs with
; v_perm_b32 (selector 0x5040100) and issue a single v_pk_max_f16; the CI run
; converts each element to f16 and back to f32.
191define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
192; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
193; VI:       ; %bb.0:
194; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
196; VI-NEXT:    v_max_f16_e32 v0, v0, v0
197; VI-NEXT:    v_or_b32_e32 v0, v0, v1
198; VI-NEXT:    s_setpc_b64 s[30:31]
199;
200; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16:
201; GFX9:       ; %bb.0:
202; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
204; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
205; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
206; GFX9-NEXT:    s_setpc_b64 s[30:31]
207;
208; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
209; CI:       ; %bb.0:
210; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
212; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
213; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
214; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
215; CI-NEXT:    s_setpc_b64 s[30:31]
216;
217; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
218; GFX11:       ; %bb.0:
219; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
221; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
222; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
223; GFX11-NEXT:    s_setpc_b64 s[30:31]
224  %ins0 = insertelement <2 x half> undef, half %lo, i32 0
225  %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
226  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
227  ret <2 x half> %canonicalized
228}
229
; Canonicalizes fabs(x) for a half loaded from %out and stores the result back
; to %out. The expected output has no separate fabs instruction: the absolute
; value appears as a |v| source modifier on v_max_f16_e64 (VI, GFX9 and GFX11
; runs) or on v_cvt_f32_f16_e64 (CI run).
230define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 {
231; VI-LABEL: v_test_canonicalize_fabs_var_f16:
232; VI:       ; %bb.0:
233; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
234; VI-NEXT:    s_waitcnt lgkmcnt(0)
235; VI-NEXT:    v_mov_b32_e32 v0, s0
236; VI-NEXT:    v_mov_b32_e32 v1, s1
237; VI-NEXT:    flat_load_ushort v2, v[0:1]
238; VI-NEXT:    s_waitcnt vmcnt(0)
239; VI-NEXT:    v_max_f16_e64 v2, |v2|, |v2|
240; VI-NEXT:    flat_store_short v[0:1], v2
241; VI-NEXT:    s_endpgm
242;
243; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
244; GFX9:       ; %bb.0:
245; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
246; GFX9-NEXT:    v_mov_b32_e32 v0, 0
247; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
249; GFX9-NEXT:    s_waitcnt vmcnt(0)
250; GFX9-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
251; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
252; GFX9-NEXT:    s_endpgm
253;
254; CI-LABEL: v_test_canonicalize_fabs_var_f16:
255; CI:       ; %bb.0:
256; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
257; CI-NEXT:    s_mov_b32 s3, 0xf000
258; CI-NEXT:    s_mov_b32 s2, -1
259; CI-NEXT:    s_waitcnt lgkmcnt(0)
260; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
261; CI-NEXT:    s_waitcnt vmcnt(0)
262; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
263; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
264; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
265; CI-NEXT:    s_endpgm
266;
267; GFX11-TRUE16-LABEL: v_test_canonicalize_fabs_var_f16:
268; GFX11-TRUE16:       ; %bb.0:
269; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
270; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
271; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
273; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
274; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, |v0.l|, |v0.l|
275; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
276; GFX11-TRUE16-NEXT:    s_endpgm
277;
278; GFX11-FAKE16-LABEL: v_test_canonicalize_fabs_var_f16:
279; GFX11-FAKE16:       ; %bb.0:
280; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
281; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
282; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
284; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
285; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
286; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
287; GFX11-FAKE16-NEXT:    s_endpgm
288  %val = load half, ptr addrspace(1) %out
289  %val.fabs = call half @llvm.fabs.f16(half %val)
290  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
291  store half %canonicalized, ptr addrspace(1) %out
292  ret void
293}
294
; Canonicalizes -fabs(x) for a half loaded from %out and stores the result
; back to %out. The expected output folds both the fabs and the fneg into a
; combined -|v| source modifier on v_max_f16_e64 (VI, GFX9 and GFX11 runs) or
; on v_cvt_f32_f16_e64 (CI run).
295define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 {
296; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
297; VI:       ; %bb.0:
298; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
299; VI-NEXT:    s_waitcnt lgkmcnt(0)
300; VI-NEXT:    v_mov_b32_e32 v0, s0
301; VI-NEXT:    v_mov_b32_e32 v1, s1
302; VI-NEXT:    flat_load_ushort v2, v[0:1]
303; VI-NEXT:    s_waitcnt vmcnt(0)
304; VI-NEXT:    v_max_f16_e64 v2, -|v2|, -|v2|
305; VI-NEXT:    flat_store_short v[0:1], v2
306; VI-NEXT:    s_endpgm
307;
308; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
309; GFX9:       ; %bb.0:
310; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
311; GFX9-NEXT:    v_mov_b32_e32 v0, 0
312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
314; GFX9-NEXT:    s_waitcnt vmcnt(0)
315; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
316; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
317; GFX9-NEXT:    s_endpgm
318;
319; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
320; CI:       ; %bb.0:
321; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
322; CI-NEXT:    s_mov_b32 s3, 0xf000
323; CI-NEXT:    s_mov_b32 s2, -1
324; CI-NEXT:    s_waitcnt lgkmcnt(0)
325; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
326; CI-NEXT:    s_waitcnt vmcnt(0)
327; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
328; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
329; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
330; CI-NEXT:    s_endpgm
331;
332; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
333; GFX11-TRUE16:       ; %bb.0:
334; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
335; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
336; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
338; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
339; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
340; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
341; GFX11-TRUE16-NEXT:    s_endpgm
342;
343; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
344; GFX11-FAKE16:       ; %bb.0:
345; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
346; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
347; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
349; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
350; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
351; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
352; GFX11-FAKE16-NEXT:    s_endpgm
353  %val = load half, ptr addrspace(1) %out
354  %val.fabs = call half @llvm.fabs.f16(half %val)
355  %val.fabs.fneg = fneg half %val.fabs
356  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
357  store half %canonicalized, ptr addrspace(1) %out
358  ret void
359}
360
; Canonicalizes -x for a half loaded from %out and stores the result back to
; %out. The expected output folds the fneg into a negation source modifier on
; v_max_f16_e64 (VI, GFX9 and GFX11 runs) or on v_cvt_f32_f16_e64 (CI run).
361define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 {
362; VI-LABEL: v_test_canonicalize_fneg_var_f16:
363; VI:       ; %bb.0:
364; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
365; VI-NEXT:    s_waitcnt lgkmcnt(0)
366; VI-NEXT:    v_mov_b32_e32 v0, s0
367; VI-NEXT:    v_mov_b32_e32 v1, s1
368; VI-NEXT:    flat_load_ushort v2, v[0:1]
369; VI-NEXT:    s_waitcnt vmcnt(0)
370; VI-NEXT:    v_max_f16_e64 v2, -v2, -v2
371; VI-NEXT:    flat_store_short v[0:1], v2
372; VI-NEXT:    s_endpgm
373;
374; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
375; GFX9:       ; %bb.0:
376; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
377; GFX9-NEXT:    v_mov_b32_e32 v0, 0
378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
380; GFX9-NEXT:    s_waitcnt vmcnt(0)
381; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1
382; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
383; GFX9-NEXT:    s_endpgm
384;
385; CI-LABEL: v_test_canonicalize_fneg_var_f16:
386; CI:       ; %bb.0:
387; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
388; CI-NEXT:    s_mov_b32 s3, 0xf000
389; CI-NEXT:    s_mov_b32 s2, -1
390; CI-NEXT:    s_waitcnt lgkmcnt(0)
391; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
392; CI-NEXT:    s_waitcnt vmcnt(0)
393; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
394; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
395; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
396; CI-NEXT:    s_endpgm
397;
398; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_var_f16:
399; GFX11-TRUE16:       ; %bb.0:
400; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
401; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
402; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
404; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
405; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
406; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
407; GFX11-TRUE16-NEXT:    s_endpgm
408;
409; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_var_f16:
410; GFX11-FAKE16:       ; %bb.0:
411; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
412; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
413; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
415; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
416; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1
417; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
418; GFX11-FAKE16-NEXT:    s_endpgm
419  %val = load half, ptr addrspace(1) %out
420  %val.fneg = fneg half %val
421  %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
422  store half %canonicalized, ptr addrspace(1) %out
423  ret void
424}
425
; Same IR body as v_test_canonicalize_fneg_var_f16, but compiled with
; attribute group #2 instead of #1. The definition of #2 is outside this
; excerpt; the function name suggests it turns f16 denormal support off
; (TODO confirm against the attribute list at the end of the file).
; The only expected difference from the #1 variant is in the VI run, which
; emits v_mul_f16_e32 by -1.0 instead of v_max_f16 with negated operands; the
; GFX9, CI and GFX11 expectations are the same as for the #1 variant.
426define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 {
427; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
428; VI:       ; %bb.0:
429; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
430; VI-NEXT:    s_waitcnt lgkmcnt(0)
431; VI-NEXT:    v_mov_b32_e32 v0, s0
432; VI-NEXT:    v_mov_b32_e32 v1, s1
433; VI-NEXT:    flat_load_ushort v2, v[0:1]
434; VI-NEXT:    s_waitcnt vmcnt(0)
435; VI-NEXT:    v_mul_f16_e32 v2, -1.0, v2
436; VI-NEXT:    flat_store_short v[0:1], v2
437; VI-NEXT:    s_endpgm
438;
439; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
440; GFX9:       ; %bb.0:
441; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
442; GFX9-NEXT:    v_mov_b32_e32 v0, 0
443; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
445; GFX9-NEXT:    s_waitcnt vmcnt(0)
446; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1
447; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
448; GFX9-NEXT:    s_endpgm
449;
450; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
451; CI:       ; %bb.0:
452; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
453; CI-NEXT:    s_mov_b32 s3, 0xf000
454; CI-NEXT:    s_mov_b32 s2, -1
455; CI-NEXT:    s_waitcnt lgkmcnt(0)
456; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
457; CI-NEXT:    s_waitcnt vmcnt(0)
458; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
459; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
460; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
461; CI-NEXT:    s_endpgm
462;
463; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
464; GFX11-TRUE16:       ; %bb.0:
465; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
466; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
467; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
469; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
470; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
471; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
472; GFX11-TRUE16-NEXT:    s_endpgm
473;
474; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
475; GFX11-FAKE16:       ; %bb.0:
476; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
477; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
478; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
480; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
481; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1
482; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
483; GFX11-FAKE16-NEXT:    s_endpgm
484  %val = load half, ptr addrspace(1) %out
485  %val.fneg = fneg half %val
486  %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
487  store half %canonicalized, ptr addrspace(1) %out
488  ret void
489}
490
; Same IR body as v_test_canonicalize_fneg_fabs_var_f16, but compiled with
; attribute group #2 instead of #1. The definition of #2 is outside this
; excerpt; the function name suggests it turns f16 denormal support off
; (TODO confirm against the attribute list at the end of the file).
; The only expected difference from the #1 variant is in the VI run, which
; emits v_mul_f16_e64 of -1.0 and |v2| instead of v_max_f16 with -|v2|
; operands; the GFX9, CI and GFX11 expectations are the same as for the #1
; variant.
491define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 {
492; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
493; VI:       ; %bb.0:
494; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
495; VI-NEXT:    s_waitcnt lgkmcnt(0)
496; VI-NEXT:    v_mov_b32_e32 v0, s0
497; VI-NEXT:    v_mov_b32_e32 v1, s1
498; VI-NEXT:    flat_load_ushort v2, v[0:1]
499; VI-NEXT:    s_waitcnt vmcnt(0)
500; VI-NEXT:    v_mul_f16_e64 v2, -1.0, |v2|
501; VI-NEXT:    flat_store_short v[0:1], v2
502; VI-NEXT:    s_endpgm
503;
504; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
505; GFX9:       ; %bb.0:
506; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
507; GFX9-NEXT:    v_mov_b32_e32 v0, 0
508; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
510; GFX9-NEXT:    s_waitcnt vmcnt(0)
511; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
512; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
513; GFX9-NEXT:    s_endpgm
514;
515; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
516; CI:       ; %bb.0:
517; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
518; CI-NEXT:    s_mov_b32 s3, 0xf000
519; CI-NEXT:    s_mov_b32 s2, -1
520; CI-NEXT:    s_waitcnt lgkmcnt(0)
521; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
522; CI-NEXT:    s_waitcnt vmcnt(0)
523; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
524; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
525; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
526; CI-NEXT:    s_endpgm
527;
528; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
529; GFX11-TRUE16:       ; %bb.0:
530; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
531; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
532; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
534; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
535; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
536; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
537; GFX11-TRUE16-NEXT:    s_endpgm
538;
539; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
540; GFX11-FAKE16:       ; %bb.0:
541; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
542; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
543; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
544; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
545; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
546; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
547; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
548; GFX11-FAKE16-NEXT:    s_endpgm
549  %val = load half, ptr addrspace(1) %out
550  %val.fabs = call half @llvm.fabs.f16(half %val)
551  %val.fabs.fneg = fneg half %val.fabs
552  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg)
553  store half %canonicalized, ptr addrspace(1) %out
554  ret void
555}
556
; Constant-folding test: canonicalize of the half constant +0.0, stored to
; %out. Every run line expects just a move of the immediate 0 (the half bit
; pattern of +0.0) followed by a 16-bit store, with no max or convert
; instructions.
557define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 {
558; VI-LABEL: test_fold_canonicalize_p0_f16:
559; VI:       ; %bb.0:
560; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
561; VI-NEXT:    v_mov_b32_e32 v2, 0
562; VI-NEXT:    s_waitcnt lgkmcnt(0)
563; VI-NEXT:    v_mov_b32_e32 v0, s0
564; VI-NEXT:    v_mov_b32_e32 v1, s1
565; VI-NEXT:    flat_store_short v[0:1], v2
566; VI-NEXT:    s_endpgm
567;
568; GFX9-LABEL: test_fold_canonicalize_p0_f16:
569; GFX9:       ; %bb.0:
570; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
571; GFX9-NEXT:    v_mov_b32_e32 v0, 0
572; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX9-NEXT:    global_store_short v0, v0, s[0:1]
574; GFX9-NEXT:    s_endpgm
575;
576; CI-LABEL: test_fold_canonicalize_p0_f16:
577; CI:       ; %bb.0:
578; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
579; CI-NEXT:    s_mov_b32 s3, 0xf000
580; CI-NEXT:    s_mov_b32 s2, -1
581; CI-NEXT:    v_mov_b32_e32 v0, 0
582; CI-NEXT:    s_waitcnt lgkmcnt(0)
583; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
584; CI-NEXT:    s_endpgm
585;
586; GFX11-LABEL: test_fold_canonicalize_p0_f16:
587; GFX11:       ; %bb.0:
588; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
589; GFX11-NEXT:    v_mov_b32_e32 v0, 0
590; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
591; GFX11-NEXT:    global_store_b16 v0, v0, s[0:1]
592; GFX11-NEXT:    s_endpgm
593  %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
594  store half %canonicalized, ptr addrspace(1) %out
595  ret void
596}
597
; Constant-folding test: canonicalize of the half constant -0.0, stored to
; %out. The call is folded to an immediate whose low 16 bits are 0x8000 (the
; half bit pattern of -0.0). The VI, GFX9 and GFX11 runs print it sign-extended
; to 32 bits as 0xffff8000, the CI run prints 0x8000; only the low 16 bits are
; written by the 16-bit store.
598define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 {
599; VI-LABEL: test_fold_canonicalize_n0_f16:
600; VI:       ; %bb.0:
601; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
602; VI-NEXT:    v_mov_b32_e32 v2, 0xffff8000
603; VI-NEXT:    s_waitcnt lgkmcnt(0)
604; VI-NEXT:    v_mov_b32_e32 v0, s0
605; VI-NEXT:    v_mov_b32_e32 v1, s1
606; VI-NEXT:    flat_store_short v[0:1], v2
607; VI-NEXT:    s_endpgm
608;
609; GFX9-LABEL: test_fold_canonicalize_n0_f16:
610; GFX9:       ; %bb.0:
611; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
612; GFX9-NEXT:    v_mov_b32_e32 v0, 0
613; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
614; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
616; GFX9-NEXT:    s_endpgm
617;
618; CI-LABEL: test_fold_canonicalize_n0_f16:
619; CI:       ; %bb.0:
620; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
621; CI-NEXT:    s_mov_b32 s3, 0xf000
622; CI-NEXT:    s_mov_b32 s2, -1
623; CI-NEXT:    v_mov_b32_e32 v0, 0x8000
624; CI-NEXT:    s_waitcnt lgkmcnt(0)
625; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
626; CI-NEXT:    s_endpgm
627;
628; GFX11-LABEL: test_fold_canonicalize_n0_f16:
629; GFX11:       ; %bb.0:
630; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
631; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
632; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
634; GFX11-NEXT:    s_endpgm
635  %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
636  store half %canonicalized, ptr addrspace(1) %out
637  ret void
638}
639
; Constant-folding test: canonicalize of the half constant 1.0, stored to
; %out. Every run line expects the call to be folded to the immediate 0x3c00
; (the half bit pattern of 1.0) followed by a 16-bit store.
640define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 {
641; VI-LABEL: test_fold_canonicalize_p1_f16:
642; VI:       ; %bb.0:
643; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
644; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
645; VI-NEXT:    s_waitcnt lgkmcnt(0)
646; VI-NEXT:    v_mov_b32_e32 v0, s0
647; VI-NEXT:    v_mov_b32_e32 v1, s1
648; VI-NEXT:    flat_store_short v[0:1], v2
649; VI-NEXT:    s_endpgm
650;
651; GFX9-LABEL: test_fold_canonicalize_p1_f16:
652; GFX9:       ; %bb.0:
653; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
654; GFX9-NEXT:    v_mov_b32_e32 v0, 0
655; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c00
656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
658; GFX9-NEXT:    s_endpgm
659;
660; CI-LABEL: test_fold_canonicalize_p1_f16:
661; CI:       ; %bb.0:
662; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
663; CI-NEXT:    s_mov_b32 s3, 0xf000
664; CI-NEXT:    s_mov_b32 s2, -1
665; CI-NEXT:    v_mov_b32_e32 v0, 0x3c00
666; CI-NEXT:    s_waitcnt lgkmcnt(0)
667; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
668; CI-NEXT:    s_endpgm
669;
670; GFX11-LABEL: test_fold_canonicalize_p1_f16:
671; GFX11:       ; %bb.0:
672; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
673; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
674; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
676; GFX11-NEXT:    s_endpgm
677  %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
678  store half %canonicalized, ptr addrspace(1) %out
679  ret void
680}
681
; Constant-folding test: canonicalize of the half constant -1.0, stored to
; %out. The call is folded to an immediate whose low 16 bits are 0xbc00 (the
; half bit pattern of -1.0). The VI, GFX9 and GFX11 runs print it sign-extended
; to 32 bits as 0xffffbc00, the CI run prints 0xbc00; only the low 16 bits are
; written by the 16-bit store.
682define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 {
683; VI-LABEL: test_fold_canonicalize_n1_f16:
684; VI:       ; %bb.0:
685; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
686; VI-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
687; VI-NEXT:    s_waitcnt lgkmcnt(0)
688; VI-NEXT:    v_mov_b32_e32 v0, s0
689; VI-NEXT:    v_mov_b32_e32 v1, s1
690; VI-NEXT:    flat_store_short v[0:1], v2
691; VI-NEXT:    s_endpgm
692;
693; GFX9-LABEL: test_fold_canonicalize_n1_f16:
694; GFX9:       ; %bb.0:
695; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
696; GFX9-NEXT:    v_mov_b32_e32 v0, 0
697; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffbc00
698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
700; GFX9-NEXT:    s_endpgm
701;
702; CI-LABEL: test_fold_canonicalize_n1_f16:
703; CI:       ; %bb.0:
704; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
705; CI-NEXT:    s_mov_b32 s3, 0xf000
706; CI-NEXT:    s_mov_b32 s2, -1
707; CI-NEXT:    v_mov_b32_e32 v0, 0xbc00
708; CI-NEXT:    s_waitcnt lgkmcnt(0)
709; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
710; CI-NEXT:    s_endpgm
711;
712; GFX11-LABEL: test_fold_canonicalize_n1_f16:
713; GFX11:       ; %bb.0:
714; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
715; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
716; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
718; GFX11-NEXT:    s_endpgm
719  %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
720  store half %canonicalized, ptr addrspace(1) %out
721  ret void
722}
723
; Constant-folding test: canonicalize of the half constant 16.0, stored to
; %out. Every run line expects the call to be folded to the immediate 0x4c00
; (the half bit pattern of 16.0) followed by a 16-bit store.
724define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 {
725; VI-LABEL: test_fold_canonicalize_literal_f16:
726; VI:       ; %bb.0:
727; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
728; VI-NEXT:    v_mov_b32_e32 v2, 0x4c00
729; VI-NEXT:    s_waitcnt lgkmcnt(0)
730; VI-NEXT:    v_mov_b32_e32 v0, s0
731; VI-NEXT:    v_mov_b32_e32 v1, s1
732; VI-NEXT:    flat_store_short v[0:1], v2
733; VI-NEXT:    s_endpgm
734;
735; GFX9-LABEL: test_fold_canonicalize_literal_f16:
736; GFX9:       ; %bb.0:
737; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
738; GFX9-NEXT:    v_mov_b32_e32 v0, 0
739; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4c00
740; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
742; GFX9-NEXT:    s_endpgm
743;
744; CI-LABEL: test_fold_canonicalize_literal_f16:
745; CI:       ; %bb.0:
746; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
747; CI-NEXT:    s_mov_b32 s3, 0xf000
748; CI-NEXT:    s_mov_b32 s2, -1
749; CI-NEXT:    v_mov_b32_e32 v0, 0x4c00
750; CI-NEXT:    s_waitcnt lgkmcnt(0)
751; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
752; CI-NEXT:    s_endpgm
753;
754; GFX11-LABEL: test_fold_canonicalize_literal_f16:
755; GFX11:       ; %bb.0:
756; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
757; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
758; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
759; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
760; GFX11-NEXT:    s_endpgm
761  %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
762  store half %canonicalized, ptr addrspace(1) %out
763  ret void
764}
765
; Constant-folding test: canonicalize of the half constant 0xH03FF (exponent
; field zero, mantissa all ones, i.e. a denormal), compiled with attribute
; group #1, whose definition is outside this excerpt. Every run line expects
; the value to be stored unchanged as the immediate 0x3ff, i.e. the denormal
; is not replaced by zero here.
766define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 {
767; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
768; VI:       ; %bb.0:
769; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
770; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff
771; VI-NEXT:    s_waitcnt lgkmcnt(0)
772; VI-NEXT:    v_mov_b32_e32 v0, s0
773; VI-NEXT:    v_mov_b32_e32 v1, s1
774; VI-NEXT:    flat_store_short v[0:1], v2
775; VI-NEXT:    s_endpgm
776;
777; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
778; GFX9:       ; %bb.0:
779; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
780; GFX9-NEXT:    v_mov_b32_e32 v0, 0
781; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff
782; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
784; GFX9-NEXT:    s_endpgm
785;
786; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
787; CI:       ; %bb.0:
788; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
789; CI-NEXT:    s_mov_b32 s3, 0xf000
790; CI-NEXT:    s_mov_b32 s2, -1
791; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff
792; CI-NEXT:    s_waitcnt lgkmcnt(0)
793; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
794; CI-NEXT:    s_endpgm
795;
796; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
797; GFX11:       ; %bb.0:
798; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
799; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
800; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
802; GFX11-NEXT:    s_endpgm
803  %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
804  store half %canonicalized, ptr addrspace(1) %out
805  ret void
806}
807
; Same input as the default-denormals variant (positive half denormal
; 0xH03FF), but compiled under attribute group #3. The expected output is
; identical: the constant 0x3ff is stored unchanged on every checked target.
; NOTE(review): attribute group #3 is defined outside this excerpt; presumably
; it explicitly enables f16 denormals - confirm.
808define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 {
809; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
810; VI:       ; %bb.0:
811; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
812; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff
813; VI-NEXT:    s_waitcnt lgkmcnt(0)
814; VI-NEXT:    v_mov_b32_e32 v0, s0
815; VI-NEXT:    v_mov_b32_e32 v1, s1
816; VI-NEXT:    flat_store_short v[0:1], v2
817; VI-NEXT:    s_endpgm
818;
819; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
820; GFX9:       ; %bb.0:
821; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
822; GFX9-NEXT:    v_mov_b32_e32 v0, 0
823; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff
824; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
825; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
826; GFX9-NEXT:    s_endpgm
827;
828; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
829; CI:       ; %bb.0:
830; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
831; CI-NEXT:    s_mov_b32 s3, 0xf000
832; CI-NEXT:    s_mov_b32 s2, -1
833; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff
834; CI-NEXT:    s_waitcnt lgkmcnt(0)
835; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
836; CI-NEXT:    s_endpgm
837;
838; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
839; GFX11:       ; %bb.0:
840; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
841; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
842; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
844; GFX11-NEXT:    s_endpgm
845  %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
846  store half %canonicalized, ptr addrspace(1) %out
847  ret void
848}
849
; Canonicalizing the negative half denormal 0xH83FF under attribute group #1.
; The constant is expected to fold to itself. The tonga, gfx900 and gfx1100
; outputs materialize it as the 32-bit immediate 0xffff83ff while the kaveri
; output uses 0x83ff; the store is 16 bits wide in every case, so the low
; half 0x83ff is what is written.
; NOTE(review): attribute group #1 is defined outside this excerpt - confirm
; that it corresponds to the default denormal mode the name refers to.
850define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 {
851; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
852; VI:       ; %bb.0:
853; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
854; VI-NEXT:    v_mov_b32_e32 v2, 0xffff83ff
855; VI-NEXT:    s_waitcnt lgkmcnt(0)
856; VI-NEXT:    v_mov_b32_e32 v0, s0
857; VI-NEXT:    v_mov_b32_e32 v1, s1
858; VI-NEXT:    flat_store_short v[0:1], v2
859; VI-NEXT:    s_endpgm
860;
861; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
862; GFX9:       ; %bb.0:
863; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
864; GFX9-NEXT:    v_mov_b32_e32 v0, 0
865; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff83ff
866; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
868; GFX9-NEXT:    s_endpgm
869;
870; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
871; CI:       ; %bb.0:
872; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
873; CI-NEXT:    s_mov_b32 s3, 0xf000
874; CI-NEXT:    s_mov_b32 s2, -1
875; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff
876; CI-NEXT:    s_waitcnt lgkmcnt(0)
877; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
878; CI-NEXT:    s_endpgm
879;
880; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
881; GFX11:       ; %bb.0:
882; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
883; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
884; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
885; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
886; GFX11-NEXT:    s_endpgm
887  %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
888  store half %canonicalized, ptr addrspace(1) %out
889  ret void
890}
891
; Negative half denormal 0xH83FF canonicalized under attribute group #3. The
; expected output matches the default-denormals variant: the constant folds
; to itself (immediate 0xffff83ff on tonga/gfx900/gfx1100, 0x83ff on kaveri)
; and is written with a 16-bit store.
; NOTE(review): attribute group #3 is defined outside this excerpt; presumably
; it explicitly enables f16 denormals - confirm.
892define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 {
893; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
894; VI:       ; %bb.0:
895; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
896; VI-NEXT:    v_mov_b32_e32 v2, 0xffff83ff
897; VI-NEXT:    s_waitcnt lgkmcnt(0)
898; VI-NEXT:    v_mov_b32_e32 v0, s0
899; VI-NEXT:    v_mov_b32_e32 v1, s1
900; VI-NEXT:    flat_store_short v[0:1], v2
901; VI-NEXT:    s_endpgm
902;
903; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
904; GFX9:       ; %bb.0:
905; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
906; GFX9-NEXT:    v_mov_b32_e32 v0, 0
907; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff83ff
908; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
909; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
910; GFX9-NEXT:    s_endpgm
911;
912; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
913; CI:       ; %bb.0:
914; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
915; CI-NEXT:    s_mov_b32 s3, 0xf000
916; CI-NEXT:    s_mov_b32 s2, -1
917; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff
918; CI-NEXT:    s_waitcnt lgkmcnt(0)
919; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
920; CI-NEXT:    s_endpgm
921;
922; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
923; GFX11:       ; %bb.0:
924; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
925; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
926; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
928; GFX11-NEXT:    s_endpgm
929  %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
930  store half %canonicalized, ptr addrspace(1) %out
931  ret void
932}
933
; Canonicalizing the constant 0xH7C00 is expected to fold to the same bit
; pattern (0x7c00) on every checked target.
; NOTE(review): despite the "qnan" in the name, 0xH7C00 (all-ones exponent,
; zero mantissa) is +infinity in IEEE binary16, not a quiet NaN. As written
; this covers folding of +inf; the canonical quiet NaN 0x7e00 is what the
; neighbouring NaN tests expect. Changing the input would require
; regenerating the assertions, so it is left as is.
934define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 {
935; VI-LABEL: test_fold_canonicalize_qnan_f16:
936; VI:       ; %bb.0:
937; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
938; VI-NEXT:    v_mov_b32_e32 v2, 0x7c00
939; VI-NEXT:    s_waitcnt lgkmcnt(0)
940; VI-NEXT:    v_mov_b32_e32 v0, s0
941; VI-NEXT:    v_mov_b32_e32 v1, s1
942; VI-NEXT:    flat_store_short v[0:1], v2
943; VI-NEXT:    s_endpgm
944;
945; GFX9-LABEL: test_fold_canonicalize_qnan_f16:
946; GFX9:       ; %bb.0:
947; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
948; GFX9-NEXT:    v_mov_b32_e32 v0, 0
949; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c00
950; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
952; GFX9-NEXT:    s_endpgm
953;
954; CI-LABEL: test_fold_canonicalize_qnan_f16:
955; CI:       ; %bb.0:
956; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
957; CI-NEXT:    s_mov_b32 s3, 0xf000
958; CI-NEXT:    s_mov_b32 s2, -1
959; CI-NEXT:    v_mov_b32_e32 v0, 0x7c00
960; CI-NEXT:    s_waitcnt lgkmcnt(0)
961; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
962; CI-NEXT:    s_endpgm
963;
964; GFX11-LABEL: test_fold_canonicalize_qnan_f16:
965; GFX11:       ; %bb.0:
966; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
967; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
968; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
969; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
970; GFX11-NEXT:    s_endpgm
971  %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
972  store half %canonicalized, ptr addrspace(1) %out
973  ret void
974}
975
; The input is the all-ones 16-bit pattern (i16 -1 bitcast to half), a NaN
; with the sign bit and every mantissa bit set. Canonicalization is expected
; to fold it to the constant 0x7e00 on every checked target.
976define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 {
977; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
978; VI:       ; %bb.0:
979; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
980; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
981; VI-NEXT:    s_waitcnt lgkmcnt(0)
982; VI-NEXT:    v_mov_b32_e32 v0, s0
983; VI-NEXT:    v_mov_b32_e32 v1, s1
984; VI-NEXT:    flat_store_short v[0:1], v2
985; VI-NEXT:    s_endpgm
986;
987; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
988; GFX9:       ; %bb.0:
989; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
990; GFX9-NEXT:    v_mov_b32_e32 v0, 0
991; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
994; GFX9-NEXT:    s_endpgm
995;
996; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
997; CI:       ; %bb.0:
998; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
999; CI-NEXT:    s_mov_b32 s3, 0xf000
1000; CI-NEXT:    s_mov_b32 s2, -1
1001; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
1002; CI-NEXT:    s_waitcnt lgkmcnt(0)
1003; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1004; CI-NEXT:    s_endpgm
1005;
1006; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
1007; GFX11:       ; %bb.0:
1008; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1009; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1010; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1012; GFX11-NEXT:    s_endpgm
1013  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
1014  store half %canonicalized, ptr addrspace(1) %out
1015  ret void
1016}
1017
; The input is i16 -2 bitcast to half (bit pattern 0xFFFE), a NaN with the
; sign bit set and a payload that differs from the neg1 variant only in the
; lowest mantissa bit. Canonicalization is expected to fold it to the
; constant 0x7e00 on every checked target.
1018define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 {
1019; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
1020; VI:       ; %bb.0:
1021; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1022; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
1023; VI-NEXT:    s_waitcnt lgkmcnt(0)
1024; VI-NEXT:    v_mov_b32_e32 v0, s0
1025; VI-NEXT:    v_mov_b32_e32 v1, s1
1026; VI-NEXT:    flat_store_short v[0:1], v2
1027; VI-NEXT:    s_endpgm
1028;
1029; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
1030; GFX9:       ; %bb.0:
1031; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1032; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1033; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
1034; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1036; GFX9-NEXT:    s_endpgm
1037;
1038; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
1039; CI:       ; %bb.0:
1040; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1041; CI-NEXT:    s_mov_b32 s3, 0xf000
1042; CI-NEXT:    s_mov_b32 s2, -1
1043; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
1044; CI-NEXT:    s_waitcnt lgkmcnt(0)
1045; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1046; CI-NEXT:    s_endpgm
1047;
1048; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
1049; GFX11:       ; %bb.0:
1050; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1051; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1052; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1054; GFX11-NEXT:    s_endpgm
1055  %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
1056  store half %canonicalized, ptr addrspace(1) %out
1057  ret void
1058}
1059
; Signaling NaN input 0xH7C01 (positive sign, quiet bit clear, smallest
; non-zero payload). Canonicalization is expected to fold it to the quiet
; NaN constant 0x7e00 on every checked target.
1060define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 {
1061; VI-LABEL: test_fold_canonicalize_snan0_value_f16:
1062; VI:       ; %bb.0:
1063; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1064; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
1065; VI-NEXT:    s_waitcnt lgkmcnt(0)
1066; VI-NEXT:    v_mov_b32_e32 v0, s0
1067; VI-NEXT:    v_mov_b32_e32 v1, s1
1068; VI-NEXT:    flat_store_short v[0:1], v2
1069; VI-NEXT:    s_endpgm
1070;
1071; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16:
1072; GFX9:       ; %bb.0:
1073; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1074; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1075; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
1076; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1077; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1078; GFX9-NEXT:    s_endpgm
1079;
1080; CI-LABEL: test_fold_canonicalize_snan0_value_f16:
1081; CI:       ; %bb.0:
1082; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1083; CI-NEXT:    s_mov_b32 s3, 0xf000
1084; CI-NEXT:    s_mov_b32 s2, -1
1085; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
1086; CI-NEXT:    s_waitcnt lgkmcnt(0)
1087; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1088; CI-NEXT:    s_endpgm
1089;
1090; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16:
1091; GFX11:       ; %bb.0:
1092; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1093; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1094; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1095; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1096; GFX11-NEXT:    s_endpgm
1097  %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
1098  store half %canonicalized, ptr addrspace(1) %out
1099  ret void
1100}
1101
; Signaling NaN input 0xH7DFF (positive sign, quiet bit clear, all remaining
; mantissa bits set). Canonicalization is expected to fold it to the quiet
; NaN constant 0x7e00 on every checked target.
1102define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 {
1103; VI-LABEL: test_fold_canonicalize_snan1_value_f16:
1104; VI:       ; %bb.0:
1105; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1106; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
1107; VI-NEXT:    s_waitcnt lgkmcnt(0)
1108; VI-NEXT:    v_mov_b32_e32 v0, s0
1109; VI-NEXT:    v_mov_b32_e32 v1, s1
1110; VI-NEXT:    flat_store_short v[0:1], v2
1111; VI-NEXT:    s_endpgm
1112;
1113; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16:
1114; GFX9:       ; %bb.0:
1115; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1116; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1117; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
1118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1120; GFX9-NEXT:    s_endpgm
1121;
1122; CI-LABEL: test_fold_canonicalize_snan1_value_f16:
1123; CI:       ; %bb.0:
1124; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1125; CI-NEXT:    s_mov_b32 s3, 0xf000
1126; CI-NEXT:    s_mov_b32 s2, -1
1127; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
1128; CI-NEXT:    s_waitcnt lgkmcnt(0)
1129; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1130; CI-NEXT:    s_endpgm
1131;
1132; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16:
1133; GFX11:       ; %bb.0:
1134; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1135; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1136; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1138; GFX11-NEXT:    s_endpgm
1139  %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
1140  store half %canonicalized, ptr addrspace(1) %out
1141  ret void
1142}
1143
; Signaling NaN input 0xHFDFF: the same payload as the snan1 test but with
; the sign bit set. The expected folded constant is still 0x7e00 on every
; checked target, so the sign of the input NaN does not survive the fold.
1144define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 {
1145; VI-LABEL: test_fold_canonicalize_snan2_value_f16:
1146; VI:       ; %bb.0:
1147; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1148; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
1149; VI-NEXT:    s_waitcnt lgkmcnt(0)
1150; VI-NEXT:    v_mov_b32_e32 v0, s0
1151; VI-NEXT:    v_mov_b32_e32 v1, s1
1152; VI-NEXT:    flat_store_short v[0:1], v2
1153; VI-NEXT:    s_endpgm
1154;
1155; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16:
1156; GFX9:       ; %bb.0:
1157; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1158; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1159; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
1160; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1162; GFX9-NEXT:    s_endpgm
1163;
1164; CI-LABEL: test_fold_canonicalize_snan2_value_f16:
1165; CI:       ; %bb.0:
1166; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1167; CI-NEXT:    s_mov_b32 s3, 0xf000
1168; CI-NEXT:    s_mov_b32 s2, -1
1169; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
1170; CI-NEXT:    s_waitcnt lgkmcnt(0)
1171; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1172; CI-NEXT:    s_endpgm
1173;
1174; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16:
1175; GFX11:       ; %bb.0:
1176; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1177; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1178; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1180; GFX11-NEXT:    s_endpgm
1181  %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
1182  store half %canonicalized, ptr addrspace(1) %out
1183  ret void
1184}
1185
; Signaling NaN input 0xHFC01: the same payload as the snan0 test but with
; the sign bit set. The expected folded constant is still 0x7e00 on every
; checked target.
1186define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 {
1187; VI-LABEL: test_fold_canonicalize_snan3_value_f16:
1188; VI:       ; %bb.0:
1189; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1190; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
1191; VI-NEXT:    s_waitcnt lgkmcnt(0)
1192; VI-NEXT:    v_mov_b32_e32 v0, s0
1193; VI-NEXT:    v_mov_b32_e32 v1, s1
1194; VI-NEXT:    flat_store_short v[0:1], v2
1195; VI-NEXT:    s_endpgm
1196;
1197; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16:
1198; GFX9:       ; %bb.0:
1199; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1200; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1201; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
1202; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1203; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1204; GFX9-NEXT:    s_endpgm
1205;
1206; CI-LABEL: test_fold_canonicalize_snan3_value_f16:
1207; CI:       ; %bb.0:
1208; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1209; CI-NEXT:    s_mov_b32 s3, 0xf000
1210; CI-NEXT:    s_mov_b32 s2, -1
1211; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
1212; CI-NEXT:    s_waitcnt lgkmcnt(0)
1213; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1214; CI-NEXT:    s_endpgm
1215;
1216; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16:
1217; GFX11:       ; %bb.0:
1218; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1219; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
1220; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1221; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1222; GFX11-NEXT:    s_endpgm
1223  %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
1224  store half %canonicalized, ptr addrspace(1) %out
1225  ret void
1226}
1227
; Canonicalize a non-constant <2 x half>. Each work item loads the vector at
; index workitem.id.x from %out, canonicalizes it, and stores the result to
; the base of %out (not to the per-work-item address).
; Expected lowering per target:
;  - tonga: two v_max_f16 of the value with itself (an SDWA form for the
;    high half), combined with v_or_b32.
;  - gfx900 / gfx1100: a single packed v_pk_max_f16 of the value with itself.
;  - kaveri: each half is converted to f32 and back to f16, then the two
;    halves are repacked with a shift and an or.
1228define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 {
1229; VI-LABEL: v_test_canonicalize_var_v2f16:
1230; VI:       ; %bb.0:
1231; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1232; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1233; VI-NEXT:    s_waitcnt lgkmcnt(0)
1234; VI-NEXT:    v_mov_b32_e32 v1, s1
1235; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1236; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1237; VI-NEXT:    flat_load_dword v0, v[0:1]
1238; VI-NEXT:    s_waitcnt vmcnt(0)
1239; VI-NEXT:    v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1240; VI-NEXT:    v_max_f16_e32 v0, v0, v0
1241; VI-NEXT:    v_or_b32_e32 v2, v0, v1
1242; VI-NEXT:    v_mov_b32_e32 v0, s0
1243; VI-NEXT:    v_mov_b32_e32 v1, s1
1244; VI-NEXT:    flat_store_dword v[0:1], v2
1245; VI-NEXT:    s_endpgm
1246;
1247; GFX9-LABEL: v_test_canonicalize_var_v2f16:
1248; GFX9:       ; %bb.0:
1249; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1250; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1251; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1253; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
1254; GFX9-NEXT:    s_waitcnt vmcnt(0)
1255; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
1256; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
1257; GFX9-NEXT:    s_endpgm
1258;
1259; CI-LABEL: v_test_canonicalize_var_v2f16:
1260; CI:       ; %bb.0:
1261; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1262; CI-NEXT:    s_mov_b32 s3, 0xf000
1263; CI-NEXT:    s_mov_b32 s6, 0
1264; CI-NEXT:    s_mov_b32 s7, s3
1265; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1266; CI-NEXT:    s_waitcnt lgkmcnt(0)
1267; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
1268; CI-NEXT:    v_mov_b32_e32 v1, 0
1269; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1270; CI-NEXT:    s_mov_b32 s2, -1
1271; CI-NEXT:    s_waitcnt vmcnt(0)
1272; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1273; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1274; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1275; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1276; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1277; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1278; CI-NEXT:    v_or_b32_e32 v0, v0, v1
1279; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1280; CI-NEXT:    s_endpgm
1281;
1282; GFX11-LABEL: v_test_canonicalize_var_v2f16:
1283; GFX11:       ; %bb.0:
1284; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1285; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1286; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1287; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1288; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
1290; GFX11-NEXT:    s_waitcnt vmcnt(0)
1291; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
1292; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1293; GFX11-NEXT:    s_endpgm
1294  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1295  %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1296  %val = load <2 x half>, ptr addrspace(1) %gep
1297  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
1298  store <2 x half> %canonicalized, ptr addrspace(1) %out
1299  ret void
1300}
1301
; Canonicalize fabs of a loaded <2 x half>; the result is stored to the base
; of %out. This checks how the fabs combines with the canonicalize lowering.
; Expected lowering per target:
;  - tonga: the fabs becomes |v0| source modifiers on both v_max_f16
;    instructions.
;  - gfx900 / gfx1100: the sign bits of both halves are cleared with
;    v_and_b32 0x7fff7fff, followed by a plain v_pk_max_f16.
;  - kaveri: the fabs becomes |..| source modifiers on the two f16-to-f32
;    conversions.
1302define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
1303; VI-LABEL: v_test_canonicalize_fabs_var_v2f16:
1304; VI:       ; %bb.0:
1305; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1306; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1307; VI-NEXT:    s_waitcnt lgkmcnt(0)
1308; VI-NEXT:    v_mov_b32_e32 v1, s1
1309; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1310; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1311; VI-NEXT:    flat_load_dword v0, v[0:1]
1312; VI-NEXT:    s_waitcnt vmcnt(0)
1313; VI-NEXT:    v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1314; VI-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
1315; VI-NEXT:    v_or_b32_e32 v2, v0, v1
1316; VI-NEXT:    v_mov_b32_e32 v0, s0
1317; VI-NEXT:    v_mov_b32_e32 v1, s1
1318; VI-NEXT:    flat_store_dword v[0:1], v2
1319; VI-NEXT:    s_endpgm
1320;
1321; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16:
1322; GFX9:       ; %bb.0:
1323; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1324; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1325; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
1328; GFX9-NEXT:    s_waitcnt vmcnt(0)
1329; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
1330; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
1331; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
1332; GFX9-NEXT:    s_endpgm
1333;
1334; CI-LABEL: v_test_canonicalize_fabs_var_v2f16:
1335; CI:       ; %bb.0:
1336; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1337; CI-NEXT:    s_mov_b32 s3, 0xf000
1338; CI-NEXT:    s_mov_b32 s6, 0
1339; CI-NEXT:    s_mov_b32 s7, s3
1340; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1341; CI-NEXT:    s_waitcnt lgkmcnt(0)
1342; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
1343; CI-NEXT:    v_mov_b32_e32 v1, 0
1344; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1345; CI-NEXT:    s_mov_b32 s2, -1
1346; CI-NEXT:    s_waitcnt vmcnt(0)
1347; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1348; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
1349; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
1350; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1351; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1352; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1353; CI-NEXT:    v_or_b32_e32 v0, v0, v1
1354; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1355; CI-NEXT:    s_endpgm
1356;
1357; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16:
1358; GFX11:       ; %bb.0:
1359; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1360; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1361; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1362; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1363; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
1365; GFX11-NEXT:    s_waitcnt vmcnt(0)
1366; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
1367; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
1368; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1369; GFX11-NEXT:    s_endpgm
1370  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1371  %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1372  %val = load <2 x half>, ptr addrspace(1) %gep
1373  %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
1374  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
1375  store <2 x half> %canonicalized, ptr addrspace(1) %out
1376  ret void
1377}
1378
; Canonicalize fneg(fabs(x)) of a loaded <2 x half>; the result is stored to
; the base of %out.
; Expected lowering per target:
;  - tonga: both operations become -|v0| source modifiers on the two
;    v_max_f16 instructions.
;  - gfx900 / gfx1100: the fabs is a v_and_b32 with 0x7fff7fff and the fneg
;    becomes neg_lo/neg_hi modifiers on v_pk_max_f16.
;  - kaveri: the sign bits of both halves are forced on with a single
;    v_or_b32 0x80008000 before the f32 round-trip conversions.
1379define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
1380; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1381; VI:       ; %bb.0:
1382; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1383; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1384; VI-NEXT:    s_waitcnt lgkmcnt(0)
1385; VI-NEXT:    v_mov_b32_e32 v1, s1
1386; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1387; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1388; VI-NEXT:    flat_load_dword v0, v[0:1]
1389; VI-NEXT:    s_waitcnt vmcnt(0)
1390; VI-NEXT:    v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1391; VI-NEXT:    v_max_f16_e64 v0, -|v0|, -|v0|
1392; VI-NEXT:    v_or_b32_e32 v2, v0, v1
1393; VI-NEXT:    v_mov_b32_e32 v0, s0
1394; VI-NEXT:    v_mov_b32_e32 v1, s1
1395; VI-NEXT:    flat_store_dword v[0:1], v2
1396; VI-NEXT:    s_endpgm
1397;
1398; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1399; GFX9:       ; %bb.0:
1400; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1401; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1402; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1403; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1404; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
1405; GFX9-NEXT:    s_waitcnt vmcnt(0)
1406; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
1407; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1408; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
1409; GFX9-NEXT:    s_endpgm
1410;
1411; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1412; CI:       ; %bb.0:
1413; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1414; CI-NEXT:    s_mov_b32 s3, 0xf000
1415; CI-NEXT:    s_mov_b32 s6, 0
1416; CI-NEXT:    s_mov_b32 s7, s3
1417; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1418; CI-NEXT:    s_waitcnt lgkmcnt(0)
1419; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
1420; CI-NEXT:    v_mov_b32_e32 v1, 0
1421; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1422; CI-NEXT:    s_mov_b32 s2, -1
1423; CI-NEXT:    s_waitcnt vmcnt(0)
1424; CI-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
1425; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1426; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1427; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1428; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1429; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1430; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1431; CI-NEXT:    v_or_b32_e32 v0, v0, v1
1432; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1433; CI-NEXT:    s_endpgm
1434;
1435; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
1436; GFX11:       ; %bb.0:
1437; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1438; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1439; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1440; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1441; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1442; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
1443; GFX11-NEXT:    s_waitcnt vmcnt(0)
1444; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
1445; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1446; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1447; GFX11-NEXT:    s_endpgm
1448  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1449  %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1450  %val = load <2 x half>, ptr addrspace(1) %gep
1451  %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
1452  %val.fabs.fneg = fneg <2 x half> %val.fabs
1453  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
1454  store <2 x half> %canonicalized, ptr addrspace(1) %out
1455  ret void
1456}
1457
; Canonicalize fneg of a loaded <2 x half>; the result is stored to the base
; of %out.
; Expected lowering per target:
;  - tonga: the fneg becomes -v0 source modifiers on both v_max_f16
;    instructions.
;  - gfx900 / gfx1100: the fneg becomes neg_lo/neg_hi modifiers on a single
;    v_pk_max_f16, with no separate sign-flip instruction.
;  - kaveri: the sign bits of both halves are flipped with v_xor_b32
;    0x80008000 before the f32 round-trip conversions.
1458define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 {
1459; VI-LABEL: v_test_canonicalize_fneg_var_v2f16:
1460; VI:       ; %bb.0:
1461; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1462; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1463; VI-NEXT:    s_waitcnt lgkmcnt(0)
1464; VI-NEXT:    v_mov_b32_e32 v1, s1
1465; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1466; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1467; VI-NEXT:    flat_load_dword v0, v[0:1]
1468; VI-NEXT:    s_waitcnt vmcnt(0)
1469; VI-NEXT:    v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1470; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
1471; VI-NEXT:    v_or_b32_e32 v2, v0, v1
1472; VI-NEXT:    v_mov_b32_e32 v0, s0
1473; VI-NEXT:    v_mov_b32_e32 v1, s1
1474; VI-NEXT:    flat_store_dword v[0:1], v2
1475; VI-NEXT:    s_endpgm
1476;
1477; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16:
1478; GFX9:       ; %bb.0:
1479; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1480; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1481; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1482; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1483; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
1484; GFX9-NEXT:    s_waitcnt vmcnt(0)
1485; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1486; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
1487; GFX9-NEXT:    s_endpgm
1488;
1489; CI-LABEL: v_test_canonicalize_fneg_var_v2f16:
1490; CI:       ; %bb.0:
1491; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1492; CI-NEXT:    s_mov_b32 s3, 0xf000
1493; CI-NEXT:    s_mov_b32 s6, 0
1494; CI-NEXT:    s_mov_b32 s7, s3
1495; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1496; CI-NEXT:    s_waitcnt lgkmcnt(0)
1497; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
1498; CI-NEXT:    v_mov_b32_e32 v1, 0
1499; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1500; CI-NEXT:    s_mov_b32 s2, -1
1501; CI-NEXT:    s_waitcnt vmcnt(0)
1502; CI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
1503; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1504; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1505; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1506; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1507; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1508; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1509; CI-NEXT:    v_or_b32_e32 v0, v0, v1
1510; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1511; CI-NEXT:    s_endpgm
1512;
1513; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16:
1514; GFX11:       ; %bb.0:
1515; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1516; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1517; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1518; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1519; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1520; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
1521; GFX11-NEXT:    s_waitcnt vmcnt(0)
1522; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
1523; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1524; GFX11-NEXT:    s_endpgm
1525  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1526  %gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
1527  %val = load <2 x half>, ptr addrspace(1) %gep
1528  %fneg.val = fneg <2 x half> %val
1529  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
1530  store <2 x half> %canonicalized, ptr addrspace(1) %out
1531  ret void
1532}
1533
; Canonicalize a non-constant <2 x half> that is produced by bitcasting the i32
; kernel argument %val.arg, then store the result to %out.
; The operand reaches the canonicalize as a scalar-loaded value (s_load_dword /
; s_load_b32 in the expectations below). Expected lowering per target
;   - GFX9 and GFX11 use one packed v_pk_max_f16 of the value with itself.
;   - VI uses two v_max_f16 (the high half via SDWA writing WORD_1) plus v_or.
;   - CI round-trips each half through v_cvt_f32_f16 / v_cvt_f16_f32 and
;     re-packs the two halves with a shift and v_or.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1534define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 {
1535; VI-LABEL: s_test_canonicalize_var_v2f16:
1536; VI:       ; %bb.0:
1537; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
1538; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1539; VI-NEXT:    s_waitcnt lgkmcnt(0)
1540; VI-NEXT:    s_lshr_b32 s3, s2, 16
1541; VI-NEXT:    v_mov_b32_e32 v1, s3
1542; VI-NEXT:    v_max_f16_e64 v0, s2, s2
1543; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1544; VI-NEXT:    v_or_b32_e32 v2, v0, v1
1545; VI-NEXT:    v_mov_b32_e32 v0, s0
1546; VI-NEXT:    v_mov_b32_e32 v1, s1
1547; VI-NEXT:    flat_store_dword v[0:1], v2
1548; VI-NEXT:    s_endpgm
1549;
1550; GFX9-LABEL: s_test_canonicalize_var_v2f16:
1551; GFX9:       ; %bb.0:
1552; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
1553; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1554; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1555; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1556; GFX9-NEXT:    v_pk_max_f16 v1, s2, s2
1557; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1558; GFX9-NEXT:    s_endpgm
1559;
1560; CI-LABEL: s_test_canonicalize_var_v2f16:
1561; CI:       ; %bb.0:
1562; CI-NEXT:    s_load_dword s0, s[4:5], 0xb
1563; CI-NEXT:    s_mov_b32 s3, 0xf000
1564; CI-NEXT:    s_mov_b32 s2, -1
1565; CI-NEXT:    s_waitcnt lgkmcnt(0)
1566; CI-NEXT:    s_lshr_b32 s1, s0, 16
1567; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
1568; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
1569; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1570; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1571; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1572; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1573; CI-NEXT:    v_or_b32_e32 v0, v1, v0
1574; CI-NEXT:    s_waitcnt lgkmcnt(0)
1575; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1576; CI-NEXT:    s_endpgm
1577;
1578; GFX11-LABEL: s_test_canonicalize_var_v2f16:
1579; GFX11:       ; %bb.0:
1580; GFX11-NEXT:    s_clause 0x1
1581; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
1582; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1583; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1584; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX11-NEXT:    v_pk_max_f16 v1, s2, s2
1586; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1587; GFX11-NEXT:    s_endpgm
1588  %val = bitcast i32 %val.arg to <2 x half>
1589  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
1590  store <2 x half> %canonicalized, ptr addrspace(1) %out
1591  ret void
1592}
1593
; Constant-fold check, canonicalize of zeroinitializer (+0.0 in both lanes).
; No max/convert instruction is expected on any target; each one simply
; materializes the 32-bit immediate 0 and stores it to %out.
1594define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 {
1595; VI-LABEL: test_fold_canonicalize_p0_v2f16:
1596; VI:       ; %bb.0:
1597; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1598; VI-NEXT:    v_mov_b32_e32 v2, 0
1599; VI-NEXT:    s_waitcnt lgkmcnt(0)
1600; VI-NEXT:    v_mov_b32_e32 v0, s0
1601; VI-NEXT:    v_mov_b32_e32 v1, s1
1602; VI-NEXT:    flat_store_dword v[0:1], v2
1603; VI-NEXT:    s_endpgm
1604;
1605; GFX9-LABEL: test_fold_canonicalize_p0_v2f16:
1606; GFX9:       ; %bb.0:
1607; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1608; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1609; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1610; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
1611; GFX9-NEXT:    s_endpgm
1612;
1613; CI-LABEL: test_fold_canonicalize_p0_v2f16:
1614; CI:       ; %bb.0:
1615; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1616; CI-NEXT:    s_mov_b32 s3, 0xf000
1617; CI-NEXT:    s_mov_b32 s2, -1
1618; CI-NEXT:    v_mov_b32_e32 v0, 0
1619; CI-NEXT:    s_waitcnt lgkmcnt(0)
1620; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1621; CI-NEXT:    s_endpgm
1622;
1623; GFX11-LABEL: test_fold_canonicalize_p0_v2f16:
1624; GFX11:       ; %bb.0:
1625; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1626; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1627; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1628; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
1629; GFX11-NEXT:    s_endpgm
1630  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
1631  store <2 x half> %canonicalized, ptr addrspace(1) %out
1632  ret void
1633}
1634
; Constant-fold check, canonicalize of <-0.0, -0.0>.
; The sign of zero must survive the fold; every target is expected to store the
; packed immediate 0x80008000 (sign bit set in both 16-bit lanes) with no
; arithmetic instruction emitted.
1635define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 {
1636; VI-LABEL: test_fold_canonicalize_n0_v2f16:
1637; VI:       ; %bb.0:
1638; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1639; VI-NEXT:    v_mov_b32_e32 v2, 0x80008000
1640; VI-NEXT:    s_waitcnt lgkmcnt(0)
1641; VI-NEXT:    v_mov_b32_e32 v0, s0
1642; VI-NEXT:    v_mov_b32_e32 v1, s1
1643; VI-NEXT:    flat_store_dword v[0:1], v2
1644; VI-NEXT:    s_endpgm
1645;
1646; GFX9-LABEL: test_fold_canonicalize_n0_v2f16:
1647; GFX9:       ; %bb.0:
1648; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1649; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1650; GFX9-NEXT:    v_mov_b32_e32 v1, 0x80008000
1651; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1652; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1653; GFX9-NEXT:    s_endpgm
1654;
1655; CI-LABEL: test_fold_canonicalize_n0_v2f16:
1656; CI:       ; %bb.0:
1657; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1658; CI-NEXT:    s_mov_b32 s3, 0xf000
1659; CI-NEXT:    s_mov_b32 s2, -1
1660; CI-NEXT:    v_mov_b32_e32 v0, 0x80008000
1661; CI-NEXT:    s_waitcnt lgkmcnt(0)
1662; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1663; CI-NEXT:    s_endpgm
1664;
1665; GFX11-LABEL: test_fold_canonicalize_n0_v2f16:
1666; GFX11:       ; %bb.0:
1667; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1668; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
1669; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1671; GFX11-NEXT:    s_endpgm
1672  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
1673  store <2 x half> %canonicalized, ptr addrspace(1) %out
1674  ret void
1675}
1676
; Constant-fold check, canonicalize of <1.0, 1.0>.
; Every target is expected to store the packed immediate 0x3c003c00 (the half
; encoding of 1.0 in both lanes) with no arithmetic instruction emitted.
1677define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 {
1678; VI-LABEL: test_fold_canonicalize_p1_v2f16:
1679; VI:       ; %bb.0:
1680; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1681; VI-NEXT:    v_mov_b32_e32 v2, 0x3c003c00
1682; VI-NEXT:    s_waitcnt lgkmcnt(0)
1683; VI-NEXT:    v_mov_b32_e32 v0, s0
1684; VI-NEXT:    v_mov_b32_e32 v1, s1
1685; VI-NEXT:    flat_store_dword v[0:1], v2
1686; VI-NEXT:    s_endpgm
1687;
1688; GFX9-LABEL: test_fold_canonicalize_p1_v2f16:
1689; GFX9:       ; %bb.0:
1690; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1691; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1692; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c003c00
1693; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1695; GFX9-NEXT:    s_endpgm
1696;
1697; CI-LABEL: test_fold_canonicalize_p1_v2f16:
1698; CI:       ; %bb.0:
1699; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1700; CI-NEXT:    s_mov_b32 s3, 0xf000
1701; CI-NEXT:    s_mov_b32 s2, -1
1702; CI-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
1703; CI-NEXT:    s_waitcnt lgkmcnt(0)
1704; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1705; CI-NEXT:    s_endpgm
1706;
1707; GFX11-LABEL: test_fold_canonicalize_p1_v2f16:
1708; GFX11:       ; %bb.0:
1709; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1710; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
1711; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1712; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1713; GFX11-NEXT:    s_endpgm
1714  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
1715  store <2 x half> %canonicalized, ptr addrspace(1) %out
1716  ret void
1717}
1718
; Constant-fold check, canonicalize of <-1.0, -1.0>.
; Every target is expected to store the packed immediate 0xbc00bc00 (the half
; encoding of -1.0 in both lanes) with no arithmetic instruction emitted.
1719define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 {
1720; VI-LABEL: test_fold_canonicalize_n1_v2f16:
1721; VI:       ; %bb.0:
1722; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1723; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00bc00
1724; VI-NEXT:    s_waitcnt lgkmcnt(0)
1725; VI-NEXT:    v_mov_b32_e32 v0, s0
1726; VI-NEXT:    v_mov_b32_e32 v1, s1
1727; VI-NEXT:    flat_store_dword v[0:1], v2
1728; VI-NEXT:    s_endpgm
1729;
1730; GFX9-LABEL: test_fold_canonicalize_n1_v2f16:
1731; GFX9:       ; %bb.0:
1732; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1733; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1734; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00bc00
1735; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1736; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1737; GFX9-NEXT:    s_endpgm
1738;
1739; CI-LABEL: test_fold_canonicalize_n1_v2f16:
1740; CI:       ; %bb.0:
1741; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1742; CI-NEXT:    s_mov_b32 s3, 0xf000
1743; CI-NEXT:    s_mov_b32 s2, -1
1744; CI-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
1745; CI-NEXT:    s_waitcnt lgkmcnt(0)
1746; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1747; CI-NEXT:    s_endpgm
1748;
1749; GFX11-LABEL: test_fold_canonicalize_n1_v2f16:
1750; GFX11:       ; %bb.0:
1751; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1752; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
1753; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1754; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1755; GFX11-NEXT:    s_endpgm
1756  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
1757  store <2 x half> %canonicalized, ptr addrspace(1) %out
1758  ret void
1759}
1760
; Constant-fold check, canonicalize of <16.0, 16.0>.
; Every target is expected to store the packed 32-bit literal 0x4c004c00 (the
; half encoding of 16.0 in both lanes) with no arithmetic instruction emitted.
1761define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 {
1762; VI-LABEL: test_fold_canonicalize_literal_v2f16:
1763; VI:       ; %bb.0:
1764; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1765; VI-NEXT:    v_mov_b32_e32 v2, 0x4c004c00
1766; VI-NEXT:    s_waitcnt lgkmcnt(0)
1767; VI-NEXT:    v_mov_b32_e32 v0, s0
1768; VI-NEXT:    v_mov_b32_e32 v1, s1
1769; VI-NEXT:    flat_store_dword v[0:1], v2
1770; VI-NEXT:    s_endpgm
1771;
1772; GFX9-LABEL: test_fold_canonicalize_literal_v2f16:
1773; GFX9:       ; %bb.0:
1774; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1775; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1776; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4c004c00
1777; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1778; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1779; GFX9-NEXT:    s_endpgm
1780;
1781; CI-LABEL: test_fold_canonicalize_literal_v2f16:
1782; CI:       ; %bb.0:
1783; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1784; CI-NEXT:    s_mov_b32 s3, 0xf000
1785; CI-NEXT:    s_mov_b32 s2, -1
1786; CI-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
1787; CI-NEXT:    s_waitcnt lgkmcnt(0)
1788; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1789; CI-NEXT:    s_endpgm
1790;
1791; GFX11-LABEL: test_fold_canonicalize_literal_v2f16:
1792; GFX11:       ; %bb.0:
1793; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1794; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
1795; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1796; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1797; GFX11-NEXT:    s_endpgm
1798  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
1799  store <2 x half> %canonicalized, ptr addrspace(1) %out
1800  ret void
1801}
1802
; Constant-fold check, canonicalize of <0xH03FF, 0xH03FF> (zero exponent field,
; all-ones fraction, i.e. a positive half denormal) under attribute group #1.
; Every target is expected to store the input bit pattern unchanged as the
; packed immediate 0x3ff03ff.
; NOTE(review) - attribute group #1 is defined outside this part of the file.
; The "no_denormals" name suggests a flushing mode, yet the expected value is
; not flushed to zero; confirm against the attribute definition which denormal
; mode actually applies to f16 here.
1803define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 {
1804; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1805; VI:       ; %bb.0:
1806; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1807; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff03ff
1808; VI-NEXT:    s_waitcnt lgkmcnt(0)
1809; VI-NEXT:    v_mov_b32_e32 v0, s0
1810; VI-NEXT:    v_mov_b32_e32 v1, s1
1811; VI-NEXT:    flat_store_dword v[0:1], v2
1812; VI-NEXT:    s_endpgm
1813;
1814; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1815; GFX9:       ; %bb.0:
1816; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1817; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1818; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff03ff
1819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1821; GFX9-NEXT:    s_endpgm
1822;
1823; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1824; CI:       ; %bb.0:
1825; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1826; CI-NEXT:    s_mov_b32 s3, 0xf000
1827; CI-NEXT:    s_mov_b32 s2, -1
1828; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff03ff
1829; CI-NEXT:    s_waitcnt lgkmcnt(0)
1830; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1831; CI-NEXT:    s_endpgm
1832;
1833; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
1834; GFX11:       ; %bb.0:
1835; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1836; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
1837; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1838; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1839; GFX11-NEXT:    s_endpgm
1840  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
1841  store <2 x half> %canonicalized, ptr addrspace(1) %out
1842  ret void
1843}
1844
; Constant-fold check, canonicalize of <0xH03FF, 0xH03FF> (zero exponent field,
; all-ones fraction, i.e. a positive half denormal) under attribute group #3.
; Every target is expected to store the input bit pattern unchanged as the
; packed immediate 0x3ff03ff.
; NOTE(review) - attribute group #3 is defined outside this part of the file;
; presumably it enables denormal support, confirm against its definition.
1845define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 {
1846; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1847; VI:       ; %bb.0:
1848; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1849; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff03ff
1850; VI-NEXT:    s_waitcnt lgkmcnt(0)
1851; VI-NEXT:    v_mov_b32_e32 v0, s0
1852; VI-NEXT:    v_mov_b32_e32 v1, s1
1853; VI-NEXT:    flat_store_dword v[0:1], v2
1854; VI-NEXT:    s_endpgm
1855;
1856; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1857; GFX9:       ; %bb.0:
1858; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1859; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1860; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff03ff
1861; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1863; GFX9-NEXT:    s_endpgm
1864;
1865; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1866; CI:       ; %bb.0:
1867; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1868; CI-NEXT:    s_mov_b32 s3, 0xf000
1869; CI-NEXT:    s_mov_b32 s2, -1
1870; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff03ff
1871; CI-NEXT:    s_waitcnt lgkmcnt(0)
1872; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1873; CI-NEXT:    s_endpgm
1874;
1875; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
1876; GFX11:       ; %bb.0:
1877; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1878; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
1879; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1880; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1881; GFX11-NEXT:    s_endpgm
1882  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
1883  store <2 x half> %canonicalized, ptr addrspace(1) %out
1884  ret void
1885}
1886
; Constant-fold check, canonicalize of <0xH83FF, 0xH83FF> (sign bit set, zero
; exponent field, all-ones fraction, i.e. a negative half denormal) under
; attribute group #1.
; Every target is expected to store the input bit pattern unchanged as the
; packed immediate 0x83ff83ff.
; NOTE(review) - attribute group #1 is defined outside this part of the file.
; The "no_denormals" name suggests a flushing mode, yet the expected value is
; not flushed to zero; confirm against the attribute definition which denormal
; mode actually applies to f16 here.
1887define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 {
1888; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1889; VI:       ; %bb.0:
1890; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1891; VI-NEXT:    v_mov_b32_e32 v2, 0x83ff83ff
1892; VI-NEXT:    s_waitcnt lgkmcnt(0)
1893; VI-NEXT:    v_mov_b32_e32 v0, s0
1894; VI-NEXT:    v_mov_b32_e32 v1, s1
1895; VI-NEXT:    flat_store_dword v[0:1], v2
1896; VI-NEXT:    s_endpgm
1897;
1898; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1899; GFX9:       ; %bb.0:
1900; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1901; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1902; GFX9-NEXT:    v_mov_b32_e32 v1, 0x83ff83ff
1903; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1904; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1905; GFX9-NEXT:    s_endpgm
1906;
1907; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1908; CI:       ; %bb.0:
1909; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1910; CI-NEXT:    s_mov_b32 s3, 0xf000
1911; CI-NEXT:    s_mov_b32 s2, -1
1912; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff83ff
1913; CI-NEXT:    s_waitcnt lgkmcnt(0)
1914; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1915; CI-NEXT:    s_endpgm
1916;
1917; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
1918; GFX11:       ; %bb.0:
1919; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1920; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
1921; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1922; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1923; GFX11-NEXT:    s_endpgm
1924  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
1925  store <2 x half> %canonicalized, ptr addrspace(1) %out
1926  ret void
1927}
1928
; Constant-fold check, canonicalize of <0xH83FF, 0xH83FF> (sign bit set, zero
; exponent field, all-ones fraction, i.e. a negative half denormal) under
; attribute group #3.
; Every target is expected to store the input bit pattern unchanged as the
; packed immediate 0x83ff83ff.
; NOTE(review) - attribute group #3 is defined outside this part of the file;
; presumably it enables denormal support, confirm against its definition.
1929define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 {
1930; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1931; VI:       ; %bb.0:
1932; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1933; VI-NEXT:    v_mov_b32_e32 v2, 0x83ff83ff
1934; VI-NEXT:    s_waitcnt lgkmcnt(0)
1935; VI-NEXT:    v_mov_b32_e32 v0, s0
1936; VI-NEXT:    v_mov_b32_e32 v1, s1
1937; VI-NEXT:    flat_store_dword v[0:1], v2
1938; VI-NEXT:    s_endpgm
1939;
1940; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1941; GFX9:       ; %bb.0:
1942; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1943; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1944; GFX9-NEXT:    v_mov_b32_e32 v1, 0x83ff83ff
1945; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1946; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1947; GFX9-NEXT:    s_endpgm
1948;
1949; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1950; CI:       ; %bb.0:
1951; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1952; CI-NEXT:    s_mov_b32 s3, 0xf000
1953; CI-NEXT:    s_mov_b32 s2, -1
1954; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff83ff
1955; CI-NEXT:    s_waitcnt lgkmcnt(0)
1956; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1957; CI-NEXT:    s_endpgm
1958;
1959; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
1960; GFX11:       ; %bb.0:
1961; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1962; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
1963; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1964; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1965; GFX11-NEXT:    s_endpgm
1966  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
1967  store <2 x half> %canonicalized, ptr addrspace(1) %out
1968  ret void
1969}
1970
; Constant-fold check, canonicalize of <0xH7C00, 0xH7C00>.
; Every target is expected to store the input bit pattern unchanged as the
; packed immediate 0x7c007c00.
; NOTE(review) - despite the "qnan" in the name, 0xH7C00 has an all-ones
; exponent and a zero fraction, which is the IEEE-754 binary16 encoding of
; +infinity rather than a quiet NaN (the other NaN folds in this file expect
; 0x7e00 per lane). As written this test covers the infinity fold; if a true
; quiet-NaN input was intended, change the constant and regenerate the
; assertions with utils/update_llc_test_checks.py.
1971define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 {
1972; VI-LABEL: test_fold_canonicalize_qnan_v2f16:
1973; VI:       ; %bb.0:
1974; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1975; VI-NEXT:    v_mov_b32_e32 v2, 0x7c007c00
1976; VI-NEXT:    s_waitcnt lgkmcnt(0)
1977; VI-NEXT:    v_mov_b32_e32 v0, s0
1978; VI-NEXT:    v_mov_b32_e32 v1, s1
1979; VI-NEXT:    flat_store_dword v[0:1], v2
1980; VI-NEXT:    s_endpgm
1981;
1982; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16:
1983; GFX9:       ; %bb.0:
1984; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1985; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1986; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c007c00
1987; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1988; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1989; GFX9-NEXT:    s_endpgm
1990;
1991; CI-LABEL: test_fold_canonicalize_qnan_v2f16:
1992; CI:       ; %bb.0:
1993; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1994; CI-NEXT:    s_mov_b32 s3, 0xf000
1995; CI-NEXT:    s_mov_b32 s2, -1
1996; CI-NEXT:    v_mov_b32_e32 v0, 0x7c007c00
1997; CI-NEXT:    s_waitcnt lgkmcnt(0)
1998; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1999; CI-NEXT:    s_endpgm
2000;
2001; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16:
2002; GFX11:       ; %bb.0:
2003; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2004; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
2005; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2006; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2007; GFX11-NEXT:    s_endpgm
2008  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
2009  store <2 x half> %canonicalized, ptr addrspace(1) %out
2010  ret void
2011}
2012
; Constant-fold check, canonicalize of i32 -1 bitcast to <2 x half>, so each
; lane holds the all-ones pattern 0xFFFF (a NaN with the sign bit and a
; non-zero payload set).
; Every target is expected to store 0x7e007e00, i.e. both lanes are replaced
; by the pattern 0x7e00 with the sign and payload bits dropped.
2013define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 {
2014; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2015; VI:       ; %bb.0:
2016; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2017; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
2018; VI-NEXT:    s_waitcnt lgkmcnt(0)
2019; VI-NEXT:    v_mov_b32_e32 v0, s0
2020; VI-NEXT:    v_mov_b32_e32 v1, s1
2021; VI-NEXT:    flat_store_dword v[0:1], v2
2022; VI-NEXT:    s_endpgm
2023;
2024; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2025; GFX9:       ; %bb.0:
2026; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2027; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2028; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2029; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2030; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2031; GFX9-NEXT:    s_endpgm
2032;
2033; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2034; CI:       ; %bb.0:
2035; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2036; CI-NEXT:    s_mov_b32 s3, 0xf000
2037; CI-NEXT:    s_mov_b32 s2, -1
2038; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
2039; CI-NEXT:    s_waitcnt lgkmcnt(0)
2040; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2041; CI-NEXT:    s_endpgm
2042;
2043; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
2044; GFX11:       ; %bb.0:
2045; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2046; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2047; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2048; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2049; GFX11-NEXT:    s_endpgm
2050  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
2051  store <2 x half> %canonicalized, ptr addrspace(1) %out
2052  ret void
2053}
2054
; Constant-fold check, canonicalize of a vector whose lanes are each i16 -2
; bitcast to half (bit pattern 0xFFFE, a NaN with the sign bit set).
; Unlike the neg1 variant, the constant is built per element rather than from
; one i32 bitcast. Every target is expected to store 0x7e007e00, i.e. both
; lanes are replaced by the pattern 0x7e00.
2055define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 {
2056; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2057; VI:       ; %bb.0:
2058; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2059; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
2060; VI-NEXT:    s_waitcnt lgkmcnt(0)
2061; VI-NEXT:    v_mov_b32_e32 v0, s0
2062; VI-NEXT:    v_mov_b32_e32 v1, s1
2063; VI-NEXT:    flat_store_dword v[0:1], v2
2064; VI-NEXT:    s_endpgm
2065;
2066; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2067; GFX9:       ; %bb.0:
2068; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2069; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2070; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2071; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2072; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2073; GFX9-NEXT:    s_endpgm
2074;
2075; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2076; CI:       ; %bb.0:
2077; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2078; CI-NEXT:    s_mov_b32 s3, 0xf000
2079; CI-NEXT:    s_mov_b32 s2, -1
2080; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
2081; CI-NEXT:    s_waitcnt lgkmcnt(0)
2082; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2083; CI-NEXT:    s_endpgm
2084;
2085; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
2086; GFX11:       ; %bb.0:
2087; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2088; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2089; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2090; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2091; GFX11-NEXT:    s_endpgm
2092  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
2093  store <2 x half> %canonicalized, ptr addrspace(1) %out
2094  ret void
2095}
2096
; Constant-fold check, canonicalize of <0xH7C01, 0xH7C01>, a positive NaN whose
; quiet bit (0x0200) is clear and whose payload is the smallest non-zero value.
; Every target is expected to store 0x7e007e00, i.e. both lanes are replaced
; by the pattern 0x7e00 (quiet bit set, payload dropped).
2097define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 {
2098; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2099; VI:       ; %bb.0:
2100; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2101; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
2102; VI-NEXT:    s_waitcnt lgkmcnt(0)
2103; VI-NEXT:    v_mov_b32_e32 v0, s0
2104; VI-NEXT:    v_mov_b32_e32 v1, s1
2105; VI-NEXT:    flat_store_dword v[0:1], v2
2106; VI-NEXT:    s_endpgm
2107;
2108; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2109; GFX9:       ; %bb.0:
2110; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2111; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2112; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2113; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2114; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2115; GFX9-NEXT:    s_endpgm
2116;
2117; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2118; CI:       ; %bb.0:
2119; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2120; CI-NEXT:    s_mov_b32 s3, 0xf000
2121; CI-NEXT:    s_mov_b32 s2, -1
2122; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
2123; CI-NEXT:    s_waitcnt lgkmcnt(0)
2124; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2125; CI-NEXT:    s_endpgm
2126;
2127; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16:
2128; GFX11:       ; %bb.0:
2129; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2130; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2131; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2133; GFX11-NEXT:    s_endpgm
2134  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
2135  store <2 x half> %canonicalized, ptr addrspace(1) %out
2136  ret void
2137}
2138
; Constant-fold check, canonicalize of <0xH7DFF, 0xH7DFF>, a positive NaN whose
; quiet bit (0x0200) is clear and whose remaining payload bits are all set.
; Every target is expected to store 0x7e007e00, i.e. both lanes are replaced
; by the pattern 0x7e00 (quiet bit set, payload dropped).
2139define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 {
2140; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2141; VI:       ; %bb.0:
2142; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2143; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
2144; VI-NEXT:    s_waitcnt lgkmcnt(0)
2145; VI-NEXT:    v_mov_b32_e32 v0, s0
2146; VI-NEXT:    v_mov_b32_e32 v1, s1
2147; VI-NEXT:    flat_store_dword v[0:1], v2
2148; VI-NEXT:    s_endpgm
2149;
2150; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2151; GFX9:       ; %bb.0:
2152; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2153; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2154; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2155; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2156; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2157; GFX9-NEXT:    s_endpgm
2158;
2159; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2160; CI:       ; %bb.0:
2161; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2162; CI-NEXT:    s_mov_b32 s3, 0xf000
2163; CI-NEXT:    s_mov_b32 s2, -1
2164; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
2165; CI-NEXT:    s_waitcnt lgkmcnt(0)
2166; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2167; CI-NEXT:    s_endpgm
2168;
2169; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16:
2170; GFX11:       ; %bb.0:
2171; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2172; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2173; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2174; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2175; GFX11-NEXT:    s_endpgm
2176  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
2177  store <2 x half> %canonicalized, ptr addrspace(1) %out
2178  ret void
2179}
2180
; Constant-fold check, canonicalize of <0xHFDFF, 0xHFDFF>, a NaN with the sign
; bit set, the quiet bit (0x0200) clear and all remaining payload bits set.
; Every target is expected to store 0x7e007e00, i.e. both lanes are replaced
; by the pattern 0x7e00; the input sign bit is not carried into the result.
2181define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 {
2182; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2183; VI:       ; %bb.0:
2184; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2185; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
2186; VI-NEXT:    s_waitcnt lgkmcnt(0)
2187; VI-NEXT:    v_mov_b32_e32 v0, s0
2188; VI-NEXT:    v_mov_b32_e32 v1, s1
2189; VI-NEXT:    flat_store_dword v[0:1], v2
2190; VI-NEXT:    s_endpgm
2191;
2192; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2193; GFX9:       ; %bb.0:
2194; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2195; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2196; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2197; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2198; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2199; GFX9-NEXT:    s_endpgm
2200;
2201; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2202; CI:       ; %bb.0:
2203; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2204; CI-NEXT:    s_mov_b32 s3, 0xf000
2205; CI-NEXT:    s_mov_b32 s2, -1
2206; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
2207; CI-NEXT:    s_waitcnt lgkmcnt(0)
2208; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2209; CI-NEXT:    s_endpgm
2210;
2211; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16:
2212; GFX11:       ; %bb.0:
2213; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2214; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2215; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2216; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2217; GFX11-NEXT:    s_endpgm
2218  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
2219  store <2 x half> %canonicalized, ptr addrspace(1) %out
2220  ret void
2221}
2222
; Constant-fold check, canonicalize of <0xHFC01, 0xHFC01>, a NaN with the sign
; bit set, the quiet bit (0x0200) clear and the smallest non-zero payload.
; Every target is expected to store 0x7e007e00, i.e. both lanes are replaced
; by the pattern 0x7e00; the input sign bit is not carried into the result.
2223define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 {
2224; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2225; VI:       ; %bb.0:
2226; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2227; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
2228; VI-NEXT:    s_waitcnt lgkmcnt(0)
2229; VI-NEXT:    v_mov_b32_e32 v0, s0
2230; VI-NEXT:    v_mov_b32_e32 v1, s1
2231; VI-NEXT:    flat_store_dword v[0:1], v2
2232; VI-NEXT:    s_endpgm
2233;
2234; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2235; GFX9:       ; %bb.0:
2236; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2237; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2238; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2240; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2241; GFX9-NEXT:    s_endpgm
2242;
2243; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2244; CI:       ; %bb.0:
2245; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2246; CI-NEXT:    s_mov_b32 s3, 0xf000
2247; CI-NEXT:    s_mov_b32 s2, -1
2248; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
2249; CI-NEXT:    s_waitcnt lgkmcnt(0)
2250; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2251; CI-NEXT:    s_endpgm
2252;
2253; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16:
2254; GFX11:       ; %bb.0:
2255; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2256; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
2257; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2258; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2259; GFX11-NEXT:    s_endpgm
2260  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
2261  store <2 x half> %canonicalized, ptr addrspace(1) %out
2262  ret void
2263}
2264
; Canonicalize a non-constant <3 x half> passed and returned by value from a
; regular (non-kernel) function, so the expectations end in s_setpc_b64 rather
; than s_endpgm and contain no loads or stores. Expected lowering per target
;   - GFX9 and GFX11 use two packed v_pk_max_f16, one on v0 and one on v1.
;   - VI uses an SDWA v_max_f16 for the high half of v0, plain v_max_f16 for
;     the low half of v0 and for v1, then v_or to rebuild v0.
;   - CI has one value per register (v0, v1, v2) and applies a
;     v_cvt_f16_f32 / v_cvt_f32_f16 pair to each of them.
2265define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
2266; VI-LABEL: v_test_canonicalize_var_v3f16:
2267; VI:       ; %bb.0:
2268; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2269; VI-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2270; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2271; VI-NEXT:    v_max_f16_e32 v1, v1, v1
2272; VI-NEXT:    v_or_b32_e32 v0, v0, v2
2273; VI-NEXT:    s_setpc_b64 s[30:31]
2274;
2275; GFX9-LABEL: v_test_canonicalize_var_v3f16:
2276; GFX9:       ; %bb.0:
2277; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2278; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
2279; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2280; GFX9-NEXT:    s_setpc_b64 s[30:31]
2281;
2282; CI-LABEL: v_test_canonicalize_var_v3f16:
2283; CI:       ; %bb.0:
2284; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2285; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2286; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2287; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2288; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2289; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2290; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2291; CI-NEXT:    s_setpc_b64 s[30:31]
2292;
2293; GFX11-LABEL: v_test_canonicalize_var_v3f16:
2294; GFX11:       ; %bb.0:
2295; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2296; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
2297; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2298; GFX11-NEXT:    s_setpc_b64 s[30:31]
2299  %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
2300  ret <3 x half> %canonicalized
2301}
2302
; Canonicalize a <4 x half> function argument and return the result.
; The GFX9 and GFX11 checks expect two v_pk_max_f16 (one per 32-bit register),
; the VI checks expect four v_max_f16 (SDWA for the high halves) recombined
; with v_or_b32, and the CI checks expect an f32 -> f16 -> f32 round trip on
; each of the four elements.
2303define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
2304; VI-LABEL: v_test_canonicalize_var_v4f16:
2305; VI:       ; %bb.0:
2306; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2307; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2308; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2309; VI-NEXT:    v_max_f16_e32 v1, v1, v1
2310; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2311; VI-NEXT:    v_or_b32_e32 v0, v0, v3
2312; VI-NEXT:    v_or_b32_e32 v1, v1, v2
2313; VI-NEXT:    s_setpc_b64 s[30:31]
2314;
2315; GFX9-LABEL: v_test_canonicalize_var_v4f16:
2316; GFX9:       ; %bb.0:
2317; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2318; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
2319; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2320; GFX9-NEXT:    s_setpc_b64 s[30:31]
2321;
2322; CI-LABEL: v_test_canonicalize_var_v4f16:
2323; CI:       ; %bb.0:
2324; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2325; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2326; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2327; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2328; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2329; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2330; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2331; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2332; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2333; CI-NEXT:    s_setpc_b64 s[30:31]
2334;
2335; GFX11-LABEL: v_test_canonicalize_var_v4f16:
2336; GFX11:       ; %bb.0:
2337; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2338; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
2339; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2340; GFX11-NEXT:    s_setpc_b64 s[30:31]
2341  %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
2342  ret <4 x half> %canonicalized
2343}
2344
; Kernel that canonicalizes an undef <2 x half> and stores the result to %out.
; Every check prefix expects the canonicalize to fold away completely, leaving
; a single 32-bit store of the constant 0.
2345define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 {
2346; VI-LABEL: s_test_canonicalize_undef_v2f16:
2347; VI:       ; %bb.0:
2348; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2349; VI-NEXT:    v_mov_b32_e32 v2, 0
2350; VI-NEXT:    s_waitcnt lgkmcnt(0)
2351; VI-NEXT:    v_mov_b32_e32 v0, s0
2352; VI-NEXT:    v_mov_b32_e32 v1, s1
2353; VI-NEXT:    flat_store_dword v[0:1], v2
2354; VI-NEXT:    s_endpgm
2355;
2356; GFX9-LABEL: s_test_canonicalize_undef_v2f16:
2357; GFX9:       ; %bb.0:
2358; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2359; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2360; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
2362; GFX9-NEXT:    s_endpgm
2363;
2364; CI-LABEL: s_test_canonicalize_undef_v2f16:
2365; CI:       ; %bb.0:
2366; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2367; CI-NEXT:    s_mov_b32 s3, 0xf000
2368; CI-NEXT:    s_mov_b32 s2, -1
2369; CI-NEXT:    v_mov_b32_e32 v0, 0
2370; CI-NEXT:    s_waitcnt lgkmcnt(0)
2371; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2372; CI-NEXT:    s_endpgm
2373;
2374; GFX11-LABEL: s_test_canonicalize_undef_v2f16:
2375; GFX11:       ; %bb.0:
2376; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2377; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2378; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2379; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
2380; GFX11-NEXT:    s_endpgm
2381  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
2382  store <2 x half> %canonicalized, ptr addrspace(1) %out
2383  ret void
2384}
2385
; Canonicalize a <2 x half> whose element 0 is %val and whose element 1 is
; left undef. Only %val is expected to be canonicalized. The VI checks expect
; just the v_max_f16, the GFX9 and GFX11 checks additionally pack the result
; with a 0 high half, and the CI checks return 0x7fc00000 (the bit pattern of
; an f32 quiet NaN) in v1 for the undef element.
2386define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
2387; VI-LABEL: v_test_canonicalize_reg_undef_v2f16:
2388; VI:       ; %bb.0:
2389; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2390; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2391; VI-NEXT:    s_setpc_b64 s[30:31]
2392;
2393; GFX9-LABEL: v_test_canonicalize_reg_undef_v2f16:
2394; GFX9:       ; %bb.0:
2395; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2396; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
2397; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
2398; GFX9-NEXT:    s_setpc_b64 s[30:31]
2399;
2400; CI-LABEL: v_test_canonicalize_reg_undef_v2f16:
2401; CI:       ; %bb.0:
2402; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2403; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2404; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
2405; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2406; CI-NEXT:    s_setpc_b64 s[30:31]
2407;
2408; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
2409; GFX11-TRUE16:       ; %bb.0:
2410; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2411; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
2412; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2413; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
2414; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
2415;
2416; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
2417; GFX11-FAKE16:       ; %bb.0:
2418; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2419; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
2420; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2421; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, 0
2422; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
2423  %vec = insertelement <2 x half> undef, half %val, i32 0
2424  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2425  ret <2 x half> %canonicalized
2426}
2427
; Mirror of the reg_undef case, with element 1 taken from %val and element 0
; left undef. The VI and GFX9 checks expect a single SDWA v_max_f16 that
; writes the result into the high word (dst_sel WORD_1). Both GFX11 variants
; expect a v_max_f16 followed by a shift left by 16. The CI checks return the
; converted value in v1 and 0x7fc00000 (f32 quiet NaN bits) in v0.
2428define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
2429; VI-LABEL: v_test_canonicalize_undef_reg_v2f16:
2430; VI:       ; %bb.0:
2431; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2432; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2433; VI-NEXT:    s_setpc_b64 s[30:31]
2434;
2435; GFX9-LABEL: v_test_canonicalize_undef_reg_v2f16:
2436; GFX9:       ; %bb.0:
2437; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2438; GFX9-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2439; GFX9-NEXT:    s_setpc_b64 s[30:31]
2440;
2441; CI-LABEL: v_test_canonicalize_undef_reg_v2f16:
2442; CI:       ; %bb.0:
2443; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2444; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2445; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
2446; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
2447; CI-NEXT:    s_setpc_b64 s[30:31]
2448;
2449; GFX11-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16:
2450; GFX11-TRUE16:       ; %bb.0:
2451; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
2453; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2454; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2455; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
2456;
2457; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16:
2458; GFX11-FAKE16:       ; %bb.0:
2459; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2460; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
2461; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2462; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2463; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
2464  %vec = insertelement <2 x half> undef, half %val, i32 1
2465  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2466  ret <2 x half> %canonicalized
2467}
2468
; Canonicalize a constant <2 x half> with an undef low element and 1.0 in the
; high element. All checks expect the call to constant fold. On VI, GFX9 and
; GFX11 the result is materialized with v_bfrev_b32 of 60, which bit-reverses
; to 0x3c000000, i.e. half 1.0 (0x3c00) in the high 16 bits over a zero low
; half. The CI checks return 0 in v0 and 1.0 in v1.
2469define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
2470; VI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2471; VI:       ; %bb.0:
2472; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2473; VI-NEXT:    v_bfrev_b32_e32 v0, 60
2474; VI-NEXT:    s_setpc_b64 s[30:31]
2475;
2476; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2477; GFX9:       ; %bb.0:
2478; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2479; GFX9-NEXT:    v_bfrev_b32_e32 v0, 60
2480; GFX9-NEXT:    s_setpc_b64 s[30:31]
2481;
2482; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2483; CI:       ; %bb.0:
2484; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2485; CI-NEXT:    v_mov_b32_e32 v0, 0
2486; CI-NEXT:    v_mov_b32_e32 v1, 1.0
2487; CI-NEXT:    s_setpc_b64 s[30:31]
2488;
2489; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
2490; GFX11:       ; %bb.0:
2491; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2492; GFX11-NEXT:    v_bfrev_b32_e32 v0, 60
2493; GFX11-NEXT:    s_setpc_b64 s[30:31]
2494  %vec = insertelement <2 x half> undef, half 1.0, i32 1
2495  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2496  ret <2 x half> %canonicalized
2497}
2498
; Canonicalize a constant <2 x half> with 1.0 in the low element and an undef
; high element. All checks expect the call to constant fold. VI, GFX9 and
; GFX11 return 0x3c00 (half 1.0 in the low 16 bits, zero high half), and the
; CI checks return 1.0 in v0 and 0 in v1.
2499define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
2500; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2501; VI:       ; %bb.0:
2502; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503; VI-NEXT:    v_mov_b32_e32 v0, 0x3c00
2504; VI-NEXT:    s_setpc_b64 s[30:31]
2505;
2506; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2507; GFX9:       ; %bb.0:
2508; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2509; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
2510; GFX9-NEXT:    s_setpc_b64 s[30:31]
2511;
2512; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2513; CI:       ; %bb.0:
2514; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2515; CI-NEXT:    v_mov_b32_e32 v0, 1.0
2516; CI-NEXT:    v_mov_b32_e32 v1, 0
2517; CI-NEXT:    s_setpc_b64 s[30:31]
2518;
2519; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
2520; GFX11:       ; %bb.0:
2521; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2522; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3c00
2523; GFX11-NEXT:    s_setpc_b64 s[30:31]
2524  %vec = insertelement <2 x half> undef, half 1.0, i32 0
2525  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2526  ret <2 x half> %canonicalized
2527}
2528
; Same shape as the undef_lo_imm_hi case, but the high element is 16.0
; instead of 1.0. All checks expect the call to constant fold. On VI, GFX9
; and GFX11 the result is v_bfrev_b32 of 50, which bit-reverses to 0x4c000000,
; i.e. half 16.0 (0x4c00) in the high 16 bits. The CI checks return 0 in v0
; and 0x41800000 (f32 16.0) in v1.
2529define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
2530; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2531; VI:       ; %bb.0:
2532; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2533; VI-NEXT:    v_bfrev_b32_e32 v0, 50
2534; VI-NEXT:    s_setpc_b64 s[30:31]
2535;
2536; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2537; GFX9:       ; %bb.0:
2538; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2539; GFX9-NEXT:    v_bfrev_b32_e32 v0, 50
2540; GFX9-NEXT:    s_setpc_b64 s[30:31]
2541;
2542; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2543; CI:       ; %bb.0:
2544; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2545; CI-NEXT:    v_mov_b32_e32 v0, 0
2546; CI-NEXT:    v_mov_b32_e32 v1, 0x41800000
2547; CI-NEXT:    s_setpc_b64 s[30:31]
2548;
2549; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
2550; GFX11:       ; %bb.0:
2551; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2552; GFX11-NEXT:    v_bfrev_b32_e32 v0, 50
2553; GFX11-NEXT:    s_setpc_b64 s[30:31]
2554  %vec = insertelement <2 x half> undef, half 16.0, i32 1
2555  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2556  ret <2 x half> %canonicalized
2557}
2558
; Same shape as the imm_lo_undef_hi case, but the low element is 16.0 instead
; of 1.0. All checks expect the call to constant fold. VI, GFX9 and GFX11
; return the literal 0x4c00 (half 16.0 in the low 16 bits, zero high half),
; and the CI checks return 0x41800000 (f32 16.0) in v0 and 0 in v1.
2559define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
2560; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2561; VI:       ; %bb.0:
2562; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2563; VI-NEXT:    v_mov_b32_e32 v0, 0x4c00
2564; VI-NEXT:    s_setpc_b64 s[30:31]
2565;
2566; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2567; GFX9:       ; %bb.0:
2568; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2569; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c00
2570; GFX9-NEXT:    s_setpc_b64 s[30:31]
2571;
2572; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2573; CI:       ; %bb.0:
2574; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575; CI-NEXT:    v_mov_b32_e32 v0, 0x41800000
2576; CI-NEXT:    v_mov_b32_e32 v1, 0
2577; CI-NEXT:    s_setpc_b64 s[30:31]
2578;
2579; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
2580; GFX11:       ; %bb.0:
2581; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2582; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4c00
2583; GFX11-NEXT:    s_setpc_b64 s[30:31]
2584  %vec = insertelement <2 x half> undef, half 16.0, i32 0
2585  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
2586  ret <2 x half> %canonicalized
2587}
2588
; Canonicalize a <2 x half> built from %val in element 0 and the constant 2.0
; in element 1. Only %val is expected to go through a v_max_f16, and the
; constant is merged in afterwards. The VI checks OR in the inline constant
; 2.0, whose 32-bit encoding 0x40000000 is half 2.0 (0x4000) in the high word.
; The GFX9 and GFX11 checks use v_pack_b32_f16 with 2.0 as the high operand.
; The CI checks return 2.0 directly in v1.
2589define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
2590; VI-LABEL: v_test_canonicalize_reg_k_v2f16:
2591; VI:       ; %bb.0:
2592; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2593; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2594; VI-NEXT:    v_or_b32_e32 v0, 2.0, v0
2595; VI-NEXT:    s_setpc_b64 s[30:31]
2596;
2597; GFX9-LABEL: v_test_canonicalize_reg_k_v2f16:
2598; GFX9:       ; %bb.0:
2599; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2600; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
2601; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 2.0
2602; GFX9-NEXT:    s_setpc_b64 s[30:31]
2603;
2604; CI-LABEL: v_test_canonicalize_reg_k_v2f16:
2605; CI:       ; %bb.0:
2606; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2607; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2608; CI-NEXT:    v_mov_b32_e32 v1, 2.0
2609; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2610; CI-NEXT:    s_setpc_b64 s[30:31]
2611;
2612; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16:
2613; GFX11-TRUE16:       ; %bb.0:
2614; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2615; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
2616; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2617; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 2.0
2618; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
2619;
2620; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16:
2621; GFX11-FAKE16:       ; %bb.0:
2622; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2623; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
2624; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2625; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, 2.0
2626; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
2627  %vec0 = insertelement <2 x half> undef, half %val, i32 0
2628  %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
2629  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
2630  ret <2 x half> %canonicalized
2631}
2632
; Mirror of the reg_k case, with the constant 2.0 in element 0 and %val in
; element 1. Only %val is expected to be canonicalized. The VI checks use an
; SDWA v_max_f16 that writes the high word and then OR in 0x4000 (half 2.0)
; for the low word. The GFX9 and GFX11 checks use v_pack_b32_f16 with 2.0 as
; the low operand. The CI checks return 2.0 in v0 and the converted value
; in v1.
2633define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
2634; VI-LABEL: v_test_canonicalize_k_reg_v2f16:
2635; VI:       ; %bb.0:
2636; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2637; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2638; VI-NEXT:    v_or_b32_e32 v0, 0x4000, v0
2639; VI-NEXT:    s_setpc_b64 s[30:31]
2640;
2641; GFX9-LABEL: v_test_canonicalize_k_reg_v2f16:
2642; GFX9:       ; %bb.0:
2643; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2644; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
2645; GFX9-NEXT:    v_pack_b32_f16 v0, 2.0, v0
2646; GFX9-NEXT:    s_setpc_b64 s[30:31]
2647;
2648; CI-LABEL: v_test_canonicalize_k_reg_v2f16:
2649; CI:       ; %bb.0:
2650; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2651; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2652; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
2653; CI-NEXT:    v_mov_b32_e32 v0, 2.0
2654; CI-NEXT:    s_setpc_b64 s[30:31]
2655;
2656; GFX11-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16:
2657; GFX11-TRUE16:       ; %bb.0:
2658; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2659; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
2660; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2661; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, 2.0, v0.l
2662; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
2663;
2664; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16:
2665; GFX11-FAKE16:       ; %bb.0:
2666; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2667; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
2668; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2669; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, 2.0, v0
2670; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
2671  %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
2672  %vec1 = insertelement <2 x half> %vec0, half %val, i32 1
2673  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
2674  ret <2 x half> %canonicalized
2675}
2676
; Kernel that canonicalizes an undef <4 x half> and stores the result to %out.
; Every check prefix expects the canonicalize to fold away completely, leaving
; a single 64-bit store of a register pair that holds 0 in both halves.
2677define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 {
2678; VI-LABEL: s_test_canonicalize_undef_v4f16:
2679; VI:       ; %bb.0:
2680; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2681; VI-NEXT:    v_mov_b32_e32 v0, 0
2682; VI-NEXT:    v_mov_b32_e32 v1, v0
2683; VI-NEXT:    s_waitcnt lgkmcnt(0)
2684; VI-NEXT:    v_mov_b32_e32 v3, s1
2685; VI-NEXT:    v_mov_b32_e32 v2, s0
2686; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2687; VI-NEXT:    s_endpgm
2688;
2689; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
2690; GFX9:       ; %bb.0:
2691; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2692; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2693; GFX9-NEXT:    v_mov_b32_e32 v1, v0
2694; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2695; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
2696; GFX9-NEXT:    s_endpgm
2697;
2698; CI-LABEL: s_test_canonicalize_undef_v4f16:
2699; CI:       ; %bb.0:
2700; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2701; CI-NEXT:    v_mov_b32_e32 v0, 0
2702; CI-NEXT:    s_mov_b32 s3, 0xf000
2703; CI-NEXT:    s_mov_b32 s2, -1
2704; CI-NEXT:    v_mov_b32_e32 v1, v0
2705; CI-NEXT:    s_waitcnt lgkmcnt(0)
2706; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2707; CI-NEXT:    s_endpgm
2708;
2709; GFX11-LABEL: s_test_canonicalize_undef_v4f16:
2710; GFX11:       ; %bb.0:
2711; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2712; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2714; GFX11-NEXT:    v_mov_b32_e32 v1, v0
2715; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2716; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
2717; GFX11-NEXT:    s_endpgm
2718  %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
2719  store <4 x half> %canonicalized, ptr addrspace(1) %out
2720  ret void
2721}
2722
; Canonicalize a <4 x half> where only element 0 is defined (from %val) and
; elements 1 to 3 are undef. Only %val is expected to go through a v_max_f16.
; On VI, GFX9 and GFX11 the second result register is the constant 0x7e007e00
; (0x7e00, a half quiet NaN bit pattern, in both halves). For the undef high
; half of the first register the VI checks OR in 0x7e000000, while the GFX9
; and GFX11 checks pack with 0. The CI checks return 0x7fc00000 (f32 quiet
; NaN bits) for each of the three undef elements.
2723define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
2724; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2725; VI:       ; %bb.0:
2726; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2727; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2728; VI-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
2729; VI-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2730; VI-NEXT:    s_setpc_b64 s[30:31]
2731;
2732; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2733; GFX9:       ; %bb.0:
2734; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2735; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
2736; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
2737; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2738; GFX9-NEXT:    s_setpc_b64 s[30:31]
2739;
2740; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2741; CI:       ; %bb.0:
2742; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2743; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2744; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
2745; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
2746; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
2747; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2748; CI-NEXT:    s_setpc_b64 s[30:31]
2749;
2750; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2751; GFX11-TRUE16:       ; %bb.0:
2752; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2753; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
2754; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2755; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2756; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
2757; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
2758;
2759; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
2760; GFX11-FAKE16:       ; %bb.0:
2761; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2762; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
2763; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2764; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2765; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, 0
2766; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
2767  %vec = insertelement <4 x half> undef, half %val, i32 0
2768  %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
2769  ret <4 x half> %canonicalized
2770}
2771
; Canonicalize a <4 x half> whose elements 0 and 1 come from %val0 and %val1
; and whose elements 2 and 3 are undef. The VI checks canonicalize each input
; with its own v_max_f16 and OR the halves together. The GFX9 and GFX11 checks
; first merge the two inputs into one register with v_perm_b32 (selector
; 0x5040100) and then issue a single v_pk_max_f16. The undef pair is returned
; as 0x7e007e00 on VI, GFX9 and GFX11, and as 0x7fc00000 per element on CI.
2772define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
2773; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2774; VI:       ; %bb.0:
2775; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2776; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2777; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2778; VI-NEXT:    v_or_b32_e32 v0, v0, v1
2779; VI-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2780; VI-NEXT:    s_setpc_b64 s[30:31]
2781;
2782; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2783; GFX9:       ; %bb.0:
2784; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2785; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
2786; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
2787; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
2788; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2789; GFX9-NEXT:    s_setpc_b64 s[30:31]
2790;
2791; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2792; CI:       ; %bb.0:
2793; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2794; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2795; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2796; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
2797; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
2798; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2799; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2800; CI-NEXT:    s_setpc_b64 s[30:31]
2801;
2802; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
2803; GFX11:       ; %bb.0:
2804; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2805; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
2806; GFX11-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
2807; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2808; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
2809; GFX11-NEXT:    s_setpc_b64 s[30:31]
2810  %vec0 = insertelement <4 x half> undef, half %val0, i32 0
2811  %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
2812  %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
2813  ret <4 x half> %canonicalized
2814}
2815
; Canonicalize a <4 x half> with %val0 in element 0, an undef element 1, and
; %val1 and %val2 in elements 2 and 3. For the first result register, only
; %val0 is canonicalized. The VI checks then OR in 0x7e000000 for the undef
; high half, while the GFX9 and GFX11 checks pack with 0. For the second
; register the VI checks use two v_max_f16 joined by v_or_b32, and the GFX9
; and GFX11 checks use v_perm_b32 followed by one v_pk_max_f16. The CI checks
; return 0x7fc00000 in v1 for the undef element.
2816define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
2817; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2818; VI:       ; %bb.0:
2819; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2820; VI-NEXT:    v_max_f16_e32 v1, v1, v1
2821; VI-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2822; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2823; VI-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
2824; VI-NEXT:    v_or_b32_e32 v1, v1, v2
2825; VI-NEXT:    s_setpc_b64 s[30:31]
2826;
2827; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2828; GFX9:       ; %bb.0:
2829; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2830; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
2831; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
2832; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
2833; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2834; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
2835; GFX9-NEXT:    s_setpc_b64 s[30:31]
2836;
2837; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2838; CI:       ; %bb.0:
2839; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2840; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2841; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2842; CI-NEXT:    v_cvt_f16_f32_e32 v3, v2
2843; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2844; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
2845; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2846; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
2847; CI-NEXT:    s_setpc_b64 s[30:31]
2848;
2849; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2850; GFX11-TRUE16:       ; %bb.0:
2851; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2852; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
2853; GFX11-TRUE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
2854; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2855; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
2856; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
2857; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
2858;
2859; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
2860; GFX11-FAKE16:       ; %bb.0:
2861; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2862; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
2863; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
2864; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2865; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, 0
2866; GFX11-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
2867; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
2868  %vec0 = insertelement <4 x half> undef, half %val0, i32 0
2869  %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
2870  %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
2871  %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
2872  ret <4 x half> %canonicalized
2873}
2874
; Canonicalize a <6 x half> function argument and return the result.
; The GFX9 and GFX11 checks expect three v_pk_max_f16 (one per 32-bit
; register), the VI checks expect six v_max_f16 (SDWA for the high halves)
; recombined with v_or_b32, and the CI checks expect an f32 -> f16 -> f32
; round trip on each of the six elements.
2875define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
2876; VI-LABEL: v_test_canonicalize_var_v6f16:
2877; VI:       ; %bb.0:
2878; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2879; VI-NEXT:    v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2880; VI-NEXT:    v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2881; VI-NEXT:    v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2882; VI-NEXT:    v_max_f16_e32 v2, v2, v2
2883; VI-NEXT:    v_max_f16_e32 v1, v1, v1
2884; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2885; VI-NEXT:    v_or_b32_e32 v0, v0, v5
2886; VI-NEXT:    v_or_b32_e32 v1, v1, v4
2887; VI-NEXT:    v_or_b32_e32 v2, v2, v3
2888; VI-NEXT:    s_setpc_b64 s[30:31]
2889;
2890; GFX9-LABEL: v_test_canonicalize_var_v6f16:
2891; GFX9:       ; %bb.0:
2892; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2893; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
2894; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2895; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
2896; GFX9-NEXT:    s_setpc_b64 s[30:31]
2897;
2898; CI-LABEL: v_test_canonicalize_var_v6f16:
2899; CI:       ; %bb.0:
2900; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2901; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2902; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2903; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2904; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2905; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2906; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2907; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
2908; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
2909; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2910; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2911; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2912; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2913; CI-NEXT:    s_setpc_b64 s[30:31]
2914;
2915; GFX11-LABEL: v_test_canonicalize_var_v6f16:
2916; GFX11:       ; %bb.0:
2917; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2918; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
2919; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2920; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
2921; GFX11-NEXT:    s_setpc_b64 s[30:31]
2922  %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val)
2923  ret <6 x half> %canonicalized
2924}
2925
; Canonicalize an <8 x half> function argument and return the result.
; The GFX9 and GFX11 checks expect four v_pk_max_f16 (one per 32-bit
; register), the VI checks expect eight v_max_f16 (SDWA for the high halves)
; recombined with v_or_b32, and the CI checks expect an f32 -> f16 -> f32
; round trip on each of the eight elements.
2926define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
2927; VI-LABEL: v_test_canonicalize_var_v8f16:
2928; VI:       ; %bb.0:
2929; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2930; VI-NEXT:    v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2931; VI-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2932; VI-NEXT:    v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2933; VI-NEXT:    v_max_f16_sdwa v7, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2934; VI-NEXT:    v_max_f16_e32 v3, v3, v3
2935; VI-NEXT:    v_max_f16_e32 v2, v2, v2
2936; VI-NEXT:    v_max_f16_e32 v1, v1, v1
2937; VI-NEXT:    v_max_f16_e32 v0, v0, v0
2938; VI-NEXT:    v_or_b32_e32 v0, v0, v7
2939; VI-NEXT:    v_or_b32_e32 v1, v1, v6
2940; VI-NEXT:    v_or_b32_e32 v2, v2, v5
2941; VI-NEXT:    v_or_b32_e32 v3, v3, v4
2942; VI-NEXT:    s_setpc_b64 s[30:31]
2943;
2944; GFX9-LABEL: v_test_canonicalize_var_v8f16:
2945; GFX9:       ; %bb.0:
2946; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2947; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
2948; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2949; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
2950; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
2951; GFX9-NEXT:    s_setpc_b64 s[30:31]
2952;
2953; CI-LABEL: v_test_canonicalize_var_v8f16:
2954; CI:       ; %bb.0:
2955; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2956; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2957; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2958; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2959; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2960; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2961; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2962; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2963; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2964; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
2965; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
2966; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
2967; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
2968; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2969; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2970; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2971; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2972; CI-NEXT:    s_setpc_b64 s[30:31]
2973;
2974; GFX11-LABEL: v_test_canonicalize_var_v8f16:
2975; GFX11:       ; %bb.0:
2976; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2977; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
2978; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2979; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
2980; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
2981; GFX11-NEXT:    s_setpc_b64 s[30:31]
2982  %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val)
2983  ret <8 x half> %canonicalized
2984}
2985
; Canonicalize a <12 x half> argument through @llvm.canonicalize.v12f16 and
; return the result. Expected code for each check prefix, as listed below.
;   VI             - for each of v0..v5, an SDWA v_max_f16 on WORD_1 plus a
;                    plain v_max_f16_e32, merged back together with v_or_b32.
;   GFX9 and GFX11 - one v_pk_max_f16 (register with itself) for each of v0..v5.
;   CI             - a v_cvt_f16_f32 / v_cvt_f32_f16 pair on each of v0..v11
;                    (presumably one 32-bit register per half element; confirm).
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2986define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
2987; VI-LABEL: v_test_canonicalize_var_v12f16:
2988; VI:       ; %bb.0:
2989; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2990; VI-NEXT:    v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2991; VI-NEXT:    v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2992; VI-NEXT:    v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2993; VI-NEXT:    v_max_f16_sdwa v9, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2994; VI-NEXT:    v_max_f16_sdwa v10, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2995; VI-NEXT:    v_max_f16_sdwa v11, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2996; VI-NEXT:    v_max_f16_e32 v5, v5, v5
2997; VI-NEXT:    v_max_f16_e32 v4, v4, v4
2998; VI-NEXT:    v_max_f16_e32 v3, v3, v3
2999; VI-NEXT:    v_max_f16_e32 v2, v2, v2
3000; VI-NEXT:    v_max_f16_e32 v1, v1, v1
3001; VI-NEXT:    v_max_f16_e32 v0, v0, v0
3002; VI-NEXT:    v_or_b32_e32 v0, v0, v11
3003; VI-NEXT:    v_or_b32_e32 v1, v1, v10
3004; VI-NEXT:    v_or_b32_e32 v2, v2, v9
3005; VI-NEXT:    v_or_b32_e32 v3, v3, v8
3006; VI-NEXT:    v_or_b32_e32 v4, v4, v7
3007; VI-NEXT:    v_or_b32_e32 v5, v5, v6
3008; VI-NEXT:    s_setpc_b64 s[30:31]
3009;
3010; GFX9-LABEL: v_test_canonicalize_var_v12f16:
3011; GFX9:       ; %bb.0:
3012; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3013; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
3014; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
3015; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
3016; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
3017; GFX9-NEXT:    v_pk_max_f16 v4, v4, v4
3018; GFX9-NEXT:    v_pk_max_f16 v5, v5, v5
3019; GFX9-NEXT:    s_setpc_b64 s[30:31]
3020;
3021; CI-LABEL: v_test_canonicalize_var_v12f16:
3022; CI:       ; %bb.0:
3023; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3024; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3025; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
3026; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
3027; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
3028; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3029; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
3030; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
3031; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
3032; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
3033; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3034; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3035; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3036; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3037; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
3038; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
3039; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
3040; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
3041; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
3042; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
3043; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
3044; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
3045; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3046; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3047; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3048; CI-NEXT:    s_setpc_b64 s[30:31]
3049;
3050; GFX11-LABEL: v_test_canonicalize_var_v12f16:
3051; GFX11:       ; %bb.0:
3052; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3053; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
3054; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
3055; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
3056; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
3057; GFX11-NEXT:    v_pk_max_f16 v4, v4, v4
3058; GFX11-NEXT:    v_pk_max_f16 v5, v5, v5
3059; GFX11-NEXT:    s_setpc_b64 s[30:31]
3060  %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val)
3061  ret <12 x half> %canonicalized
3062}
3063
; Canonicalize a <16 x half> argument through @llvm.canonicalize.v16f16 and
; return the result. Expected code for each check prefix, as listed below.
;   VI             - for each of v0..v7, an SDWA v_max_f16 on WORD_1 (written to
;                    a temporary in v8..v15) plus a plain v_max_f16_e32, merged
;                    back together with v_or_b32.
;   GFX9 and GFX11 - one v_pk_max_f16 (register with itself) for each of v0..v7.
;   CI             - a v_cvt_f16_f32 / v_cvt_f32_f16 pair on each of v0..v15
;                    (presumably one 32-bit register per half element; confirm).
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3064define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
3065; VI-LABEL: v_test_canonicalize_var_v16f16:
3066; VI:       ; %bb.0:
3067; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3068; VI-NEXT:    v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3069; VI-NEXT:    v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3070; VI-NEXT:    v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3071; VI-NEXT:    v_max_f16_sdwa v11, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3072; VI-NEXT:    v_max_f16_sdwa v12, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3073; VI-NEXT:    v_max_f16_sdwa v13, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3074; VI-NEXT:    v_max_f16_sdwa v14, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3075; VI-NEXT:    v_max_f16_sdwa v15, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3076; VI-NEXT:    v_max_f16_e32 v7, v7, v7
3077; VI-NEXT:    v_max_f16_e32 v6, v6, v6
3078; VI-NEXT:    v_max_f16_e32 v5, v5, v5
3079; VI-NEXT:    v_max_f16_e32 v4, v4, v4
3080; VI-NEXT:    v_max_f16_e32 v3, v3, v3
3081; VI-NEXT:    v_max_f16_e32 v2, v2, v2
3082; VI-NEXT:    v_max_f16_e32 v1, v1, v1
3083; VI-NEXT:    v_max_f16_e32 v0, v0, v0
3084; VI-NEXT:    v_or_b32_e32 v0, v0, v15
3085; VI-NEXT:    v_or_b32_e32 v1, v1, v14
3086; VI-NEXT:    v_or_b32_e32 v2, v2, v13
3087; VI-NEXT:    v_or_b32_e32 v3, v3, v12
3088; VI-NEXT:    v_or_b32_e32 v4, v4, v11
3089; VI-NEXT:    v_or_b32_e32 v5, v5, v10
3090; VI-NEXT:    v_or_b32_e32 v6, v6, v9
3091; VI-NEXT:    v_or_b32_e32 v7, v7, v8
3092; VI-NEXT:    s_setpc_b64 s[30:31]
3093;
3094; GFX9-LABEL: v_test_canonicalize_var_v16f16:
3095; GFX9:       ; %bb.0:
3096; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3097; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
3098; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
3099; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
3100; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
3101; GFX9-NEXT:    v_pk_max_f16 v4, v4, v4
3102; GFX9-NEXT:    v_pk_max_f16 v5, v5, v5
3103; GFX9-NEXT:    v_pk_max_f16 v6, v6, v6
3104; GFX9-NEXT:    v_pk_max_f16 v7, v7, v7
3105; GFX9-NEXT:    s_setpc_b64 s[30:31]
3106;
3107; CI-LABEL: v_test_canonicalize_var_v16f16:
3108; CI:       ; %bb.0:
3109; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3110; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
3111; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3112; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3113; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3114; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3115; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
3116; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
3117; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
3118; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3119; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
3120; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
3121; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
3122; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
3123; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3124; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3125; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3126; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
3127; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
3128; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
3129; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
3130; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3131; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
3132; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
3133; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
3134; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
3135; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
3136; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
3137; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
3138; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
3139; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3140; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3141; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3142; CI-NEXT:    s_setpc_b64 s[30:31]
3143;
3144; GFX11-LABEL: v_test_canonicalize_var_v16f16:
3145; GFX11:       ; %bb.0:
3146; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3147; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
3148; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
3149; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
3150; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
3151; GFX11-NEXT:    v_pk_max_f16 v4, v4, v4
3152; GFX11-NEXT:    v_pk_max_f16 v5, v5, v5
3153; GFX11-NEXT:    v_pk_max_f16 v6, v6, v6
3154; GFX11-NEXT:    v_pk_max_f16 v7, v7, v7
3155; GFX11-NEXT:    s_setpc_b64 s[30:31]
3156  %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val)
3157  ret <16 x half> %canonicalized
3158}
3159
; Canonicalize a <32 x half> argument through @llvm.canonicalize.v32f16 and
; return the result. Expected code for each check prefix, as listed below.
;   VI             - for each of v0..v15, an SDWA v_max_f16 on WORD_1 (written to
;                    a temporary in v16..v19) plus a plain v_max_f16_e32, merged
;                    back together with v_or_b32. The triples are interleaved
;                    rather than grouped by opcode.
;   GFX9 and GFX11 - one v_pk_max_f16 (register with itself) for each of v0..v15.
;   CI             - a v_cvt_f16_f32 / v_cvt_f32_f16 pair on each of v0..v30.
;                    v31 is first loaded with buffer_load_dword from s32, and its
;                    conversion pair follows an s_waitcnt vmcnt(0) (presumably
;                    the last element is passed on the stack; confirm).
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3160define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
3161; VI-LABEL: v_test_canonicalize_var_v32f16:
3162; VI:       ; %bb.0:
3163; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3164; VI-NEXT:    v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3165; VI-NEXT:    v_max_f16_e32 v0, v0, v0
3166; VI-NEXT:    v_or_b32_e32 v0, v0, v19
3167; VI-NEXT:    v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3168; VI-NEXT:    v_max_f16_e32 v1, v1, v1
3169; VI-NEXT:    v_or_b32_e32 v1, v1, v19
3170; VI-NEXT:    v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3171; VI-NEXT:    v_max_f16_e32 v2, v2, v2
3172; VI-NEXT:    v_or_b32_e32 v2, v2, v19
3173; VI-NEXT:    v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3174; VI-NEXT:    v_max_f16_e32 v3, v3, v3
3175; VI-NEXT:    v_or_b32_e32 v3, v3, v19
3176; VI-NEXT:    v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3177; VI-NEXT:    v_max_f16_e32 v4, v4, v4
3178; VI-NEXT:    v_or_b32_e32 v4, v4, v19
3179; VI-NEXT:    v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3180; VI-NEXT:    v_max_f16_e32 v5, v5, v5
3181; VI-NEXT:    v_or_b32_e32 v5, v5, v19
3182; VI-NEXT:    v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3183; VI-NEXT:    v_max_f16_e32 v6, v6, v6
3184; VI-NEXT:    v_or_b32_e32 v6, v6, v19
3185; VI-NEXT:    v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3186; VI-NEXT:    v_max_f16_e32 v7, v7, v7
3187; VI-NEXT:    v_or_b32_e32 v7, v7, v19
3188; VI-NEXT:    v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3189; VI-NEXT:    v_max_f16_e32 v8, v8, v8
3190; VI-NEXT:    v_or_b32_e32 v8, v8, v19
3191; VI-NEXT:    v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3192; VI-NEXT:    v_max_f16_e32 v9, v9, v9
3193; VI-NEXT:    v_or_b32_e32 v9, v9, v19
3194; VI-NEXT:    v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3195; VI-NEXT:    v_max_f16_e32 v10, v10, v10
3196; VI-NEXT:    v_or_b32_e32 v10, v10, v19
3197; VI-NEXT:    v_max_f16_sdwa v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3198; VI-NEXT:    v_max_f16_e32 v11, v11, v11
3199; VI-NEXT:    v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3200; VI-NEXT:    v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3201; VI-NEXT:    v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3202; VI-NEXT:    v_or_b32_e32 v11, v11, v19
3203; VI-NEXT:    v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3204; VI-NEXT:    v_max_f16_e32 v15, v15, v15
3205; VI-NEXT:    v_max_f16_e32 v14, v14, v14
3206; VI-NEXT:    v_max_f16_e32 v13, v13, v13
3207; VI-NEXT:    v_max_f16_e32 v12, v12, v12
3208; VI-NEXT:    v_or_b32_e32 v12, v12, v19
3209; VI-NEXT:    v_or_b32_e32 v13, v13, v18
3210; VI-NEXT:    v_or_b32_e32 v14, v14, v17
3211; VI-NEXT:    v_or_b32_e32 v15, v15, v16
3212; VI-NEXT:    s_setpc_b64 s[30:31]
3213;
3214; GFX9-LABEL: v_test_canonicalize_var_v32f16:
3215; GFX9:       ; %bb.0:
3216; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3217; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
3218; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
3219; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
3220; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
3221; GFX9-NEXT:    v_pk_max_f16 v4, v4, v4
3222; GFX9-NEXT:    v_pk_max_f16 v5, v5, v5
3223; GFX9-NEXT:    v_pk_max_f16 v6, v6, v6
3224; GFX9-NEXT:    v_pk_max_f16 v7, v7, v7
3225; GFX9-NEXT:    v_pk_max_f16 v8, v8, v8
3226; GFX9-NEXT:    v_pk_max_f16 v9, v9, v9
3227; GFX9-NEXT:    v_pk_max_f16 v10, v10, v10
3228; GFX9-NEXT:    v_pk_max_f16 v11, v11, v11
3229; GFX9-NEXT:    v_pk_max_f16 v12, v12, v12
3230; GFX9-NEXT:    v_pk_max_f16 v13, v13, v13
3231; GFX9-NEXT:    v_pk_max_f16 v14, v14, v14
3232; GFX9-NEXT:    v_pk_max_f16 v15, v15, v15
3233; GFX9-NEXT:    s_setpc_b64 s[30:31]
3234;
3235; CI-LABEL: v_test_canonicalize_var_v32f16:
3236; CI:       ; %bb.0:
3237; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3238; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
3239; CI-NEXT:    v_cvt_f16_f32_e32 v30, v30
3240; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
3241; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
3242; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
3243; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
3244; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
3245; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
3246; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
3247; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
3248; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
3249; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
3250; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
3251; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
3252; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
3253; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
3254; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
3255; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3256; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3257; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3258; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3259; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
3260; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
3261; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
3262; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3263; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
3264; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
3265; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
3266; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
3267; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3268; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3269; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3270; CI-NEXT:    v_cvt_f32_f16_e32 v30, v30
3271; CI-NEXT:    v_cvt_f32_f16_e32 v29, v29
3272; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
3273; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
3274; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
3275; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
3276; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
3277; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
3278; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
3279; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
3280; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
3281; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
3282; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
3283; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
3284; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
3285; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
3286; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
3287; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
3288; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
3289; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3290; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
3291; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
3292; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
3293; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
3294; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
3295; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
3296; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
3297; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
3298; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3299; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3300; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3301; CI-NEXT:    s_waitcnt vmcnt(0)
3302; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3303; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3304; CI-NEXT:    s_setpc_b64 s[30:31]
3305;
3306; GFX11-LABEL: v_test_canonicalize_var_v32f16:
3307; GFX11:       ; %bb.0:
3308; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3309; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
3310; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
3311; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
3312; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
3313; GFX11-NEXT:    v_pk_max_f16 v4, v4, v4
3314; GFX11-NEXT:    v_pk_max_f16 v5, v5, v5
3315; GFX11-NEXT:    v_pk_max_f16 v6, v6, v6
3316; GFX11-NEXT:    v_pk_max_f16 v7, v7, v7
3317; GFX11-NEXT:    v_pk_max_f16 v8, v8, v8
3318; GFX11-NEXT:    v_pk_max_f16 v9, v9, v9
3319; GFX11-NEXT:    v_pk_max_f16 v10, v10, v10
3320; GFX11-NEXT:    v_pk_max_f16 v11, v11, v11
3321; GFX11-NEXT:    v_pk_max_f16 v12, v12, v12
3322; GFX11-NEXT:    v_pk_max_f16 v13, v13, v13
3323; GFX11-NEXT:    v_pk_max_f16 v14, v14, v14
3324; GFX11-NEXT:    v_pk_max_f16 v15, v15, v15
3325; GFX11-NEXT:    s_setpc_b64 s[30:31]
3326  %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val)
3327  ret <32 x half> %canonicalized
3328}
3329
3330define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
3331; VI-LABEL: v_test_canonicalize_var_v64f16:
3332; VI:       ; %bb.0:
3333; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3334; VI-NEXT:    v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3335; VI-NEXT:    v_max_f16_e32 v0, v0, v0
3336; VI-NEXT:    v_or_b32_e32 v0, v0, v31
3337; VI-NEXT:    v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3338; VI-NEXT:    v_max_f16_e32 v1, v1, v1
3339; VI-NEXT:    v_or_b32_e32 v1, v1, v31
3340; VI-NEXT:    v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3341; VI-NEXT:    v_max_f16_e32 v2, v2, v2
3342; VI-NEXT:    v_or_b32_e32 v2, v2, v31
3343; VI-NEXT:    v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3344; VI-NEXT:    v_max_f16_e32 v3, v3, v3
3345; VI-NEXT:    v_or_b32_e32 v3, v3, v31
3346; VI-NEXT:    v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3347; VI-NEXT:    v_max_f16_e32 v4, v4, v4
3348; VI-NEXT:    v_or_b32_e32 v4, v4, v31
3349; VI-NEXT:    v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3350; VI-NEXT:    v_max_f16_e32 v5, v5, v5
3351; VI-NEXT:    v_or_b32_e32 v5, v5, v31
3352; VI-NEXT:    v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3353; VI-NEXT:    v_max_f16_e32 v6, v6, v6
3354; VI-NEXT:    v_or_b32_e32 v6, v6, v31
3355; VI-NEXT:    v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3356; VI-NEXT:    v_max_f16_e32 v7, v7, v7
3357; VI-NEXT:    v_or_b32_e32 v7, v7, v31
3358; VI-NEXT:    v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3359; VI-NEXT:    v_max_f16_e32 v8, v8, v8
3360; VI-NEXT:    v_or_b32_e32 v8, v8, v31
3361; VI-NEXT:    v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3362; VI-NEXT:    v_max_f16_e32 v9, v9, v9
3363; VI-NEXT:    v_or_b32_e32 v9, v9, v31
3364; VI-NEXT:    v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3365; VI-NEXT:    v_max_f16_e32 v10, v10, v10
3366; VI-NEXT:    v_or_b32_e32 v10, v10, v31
3367; VI-NEXT:    v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3368; VI-NEXT:    v_max_f16_e32 v11, v11, v11
3369; VI-NEXT:    v_or_b32_e32 v11, v11, v31
3370; VI-NEXT:    v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3371; VI-NEXT:    v_max_f16_e32 v12, v12, v12
3372; VI-NEXT:    v_or_b32_e32 v12, v12, v31
3373; VI-NEXT:    v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3374; VI-NEXT:    v_max_f16_e32 v13, v13, v13
3375; VI-NEXT:    v_or_b32_e32 v13, v13, v31
3376; VI-NEXT:    v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3377; VI-NEXT:    v_max_f16_e32 v14, v14, v14
3378; VI-NEXT:    v_or_b32_e32 v14, v14, v31
3379; VI-NEXT:    v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3380; VI-NEXT:    v_max_f16_e32 v15, v15, v15
3381; VI-NEXT:    v_or_b32_e32 v15, v15, v31
3382; VI-NEXT:    v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3383; VI-NEXT:    v_max_f16_e32 v16, v16, v16
3384; VI-NEXT:    v_or_b32_e32 v16, v16, v31
3385; VI-NEXT:    v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3386; VI-NEXT:    v_max_f16_e32 v17, v17, v17
3387; VI-NEXT:    v_or_b32_e32 v17, v17, v31
3388; VI-NEXT:    v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3389; VI-NEXT:    v_max_f16_e32 v18, v18, v18
3390; VI-NEXT:    v_or_b32_e32 v18, v18, v31
3391; VI-NEXT:    v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3392; VI-NEXT:    v_max_f16_e32 v19, v19, v19
3393; VI-NEXT:    v_or_b32_e32 v19, v19, v31
3394; VI-NEXT:    v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3395; VI-NEXT:    v_max_f16_e32 v20, v20, v20
3396; VI-NEXT:    v_or_b32_e32 v20, v20, v31
3397; VI-NEXT:    v_max_f16_sdwa v31, v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3398; VI-NEXT:    v_max_f16_e32 v21, v21, v21
3399; VI-NEXT:    v_or_b32_e32 v21, v21, v31
3400; VI-NEXT:    v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3401; VI-NEXT:    v_max_f16_e32 v22, v22, v22
3402; VI-NEXT:    v_or_b32_e32 v22, v22, v31
3403; VI-NEXT:    v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3404; VI-NEXT:    v_max_f16_e32 v23, v23, v23
3405; VI-NEXT:    v_or_b32_e32 v23, v23, v31
3406; VI-NEXT:    v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3407; VI-NEXT:    v_max_f16_e32 v24, v24, v24
3408; VI-NEXT:    v_or_b32_e32 v24, v24, v31
3409; VI-NEXT:    v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3410; VI-NEXT:    v_max_f16_e32 v25, v25, v25
3411; VI-NEXT:    v_or_b32_e32 v25, v25, v31
3412; VI-NEXT:    v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3413; VI-NEXT:    v_max_f16_e32 v26, v26, v26
3414; VI-NEXT:    v_or_b32_e32 v26, v26, v31
3415; VI-NEXT:    v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3416; VI-NEXT:    v_max_f16_e32 v27, v27, v27
3417; VI-NEXT:    v_or_b32_e32 v27, v27, v31
3418; VI-NEXT:    v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3419; VI-NEXT:    v_max_f16_e32 v28, v28, v28
3420; VI-NEXT:    v_or_b32_e32 v28, v28, v31
3421; VI-NEXT:    v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3422; VI-NEXT:    v_max_f16_e32 v29, v29, v29
3423; VI-NEXT:    v_or_b32_e32 v29, v29, v31
3424; VI-NEXT:    v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3425; VI-NEXT:    v_max_f16_e32 v30, v30, v30
3426; VI-NEXT:    v_or_b32_e32 v30, v30, v31
3427; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
3428; VI-NEXT:    s_waitcnt vmcnt(0)
3429; VI-NEXT:    v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3430; VI-NEXT:    v_max_f16_e32 v31, v31, v31
3431; VI-NEXT:    v_or_b32_e32 v31, v31, v32
3432; VI-NEXT:    s_setpc_b64 s[30:31]
3433;
3434; GFX9-LABEL: v_test_canonicalize_var_v64f16:
3435; GFX9:       ; %bb.0:
3436; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3437; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
3438; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
3439; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
3440; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
3441; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
3442; GFX9-NEXT:    v_pk_max_f16 v4, v4, v4
3443; GFX9-NEXT:    v_pk_max_f16 v5, v5, v5
3444; GFX9-NEXT:    v_pk_max_f16 v6, v6, v6
3445; GFX9-NEXT:    v_pk_max_f16 v7, v7, v7
3446; GFX9-NEXT:    v_pk_max_f16 v8, v8, v8
3447; GFX9-NEXT:    v_pk_max_f16 v9, v9, v9
3448; GFX9-NEXT:    v_pk_max_f16 v10, v10, v10
3449; GFX9-NEXT:    v_pk_max_f16 v11, v11, v11
3450; GFX9-NEXT:    v_pk_max_f16 v12, v12, v12
3451; GFX9-NEXT:    v_pk_max_f16 v13, v13, v13
3452; GFX9-NEXT:    v_pk_max_f16 v14, v14, v14
3453; GFX9-NEXT:    v_pk_max_f16 v15, v15, v15
3454; GFX9-NEXT:    v_pk_max_f16 v16, v16, v16
3455; GFX9-NEXT:    v_pk_max_f16 v17, v17, v17
3456; GFX9-NEXT:    v_pk_max_f16 v18, v18, v18
3457; GFX9-NEXT:    v_pk_max_f16 v19, v19, v19
3458; GFX9-NEXT:    v_pk_max_f16 v20, v20, v20
3459; GFX9-NEXT:    v_pk_max_f16 v21, v21, v21
3460; GFX9-NEXT:    v_pk_max_f16 v22, v22, v22
3461; GFX9-NEXT:    v_pk_max_f16 v23, v23, v23
3462; GFX9-NEXT:    v_pk_max_f16 v24, v24, v24
3463; GFX9-NEXT:    v_pk_max_f16 v25, v25, v25
3464; GFX9-NEXT:    v_pk_max_f16 v26, v26, v26
3465; GFX9-NEXT:    v_pk_max_f16 v27, v27, v27
3466; GFX9-NEXT:    v_pk_max_f16 v28, v28, v28
3467; GFX9-NEXT:    v_pk_max_f16 v29, v29, v29
3468; GFX9-NEXT:    v_pk_max_f16 v30, v30, v30
3469; GFX9-NEXT:    s_waitcnt vmcnt(0)
3470; GFX9-NEXT:    v_pk_max_f16 v31, v31, v31
3471; GFX9-NEXT:    s_setpc_b64 s[30:31]
3472;
3473; CI-LABEL: v_test_canonicalize_var_v64f16:
3474; CI:       ; %bb.0:
3475; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3476; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
3477; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
3478; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3479; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3480; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3481; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3482; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
3483; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3484; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3485; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
3486; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3487; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3488; CI-NEXT:    v_or_b32_e32 v1, v1, v2
3489; CI-NEXT:    v_cvt_f16_f32_e32 v2, v4
3490; CI-NEXT:    v_cvt_f16_f32_e32 v4, v5
3491; CI-NEXT:    v_cvt_f16_f32_e32 v5, v7
3492; CI-NEXT:    v_cvt_f16_f32_e32 v7, v11
3493; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
3494; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
3495; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
3496; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
3497; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
3498; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
3499; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
3500; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3501; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3502; CI-NEXT:    v_or_b32_e32 v2, v3, v2
3503; CI-NEXT:    v_cvt_f16_f32_e32 v3, v6
3504; CI-NEXT:    v_cvt_f16_f32_e32 v6, v9
3505; CI-NEXT:    v_cvt_f16_f32_e32 v9, v16
3506; CI-NEXT:    v_cvt_f16_f32_e32 v16, v21
3507; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
3508; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
3509; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
3510; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
3511; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
3512; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
3513; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
3514; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
3515; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3516; CI-NEXT:    v_or_b32_e32 v3, v4, v3
3517; CI-NEXT:    v_cvt_f16_f32_e32 v4, v8
3518; CI-NEXT:    v_cvt_f16_f32_e32 v8, v13
3519; CI-NEXT:    v_cvt_f16_f32_e32 v13, v20
3520; CI-NEXT:    v_cvt_f16_f32_e32 v20, v25
3521; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
3522; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
3523; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
3524; CI-NEXT:    v_cvt_f16_f32_e32 v21, v28
3525; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
3526; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
3527; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3528; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
3529; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3530; CI-NEXT:    v_or_b32_e32 v4, v5, v4
3531; CI-NEXT:    v_cvt_f16_f32_e32 v5, v10
3532; CI-NEXT:    v_cvt_f16_f32_e32 v10, v15
3533; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
3534; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
3535; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
3536; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
3537; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
3538; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:20
3539; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
3540; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
3541; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3542; CI-NEXT:    v_or_b32_e32 v5, v6, v5
3543; CI-NEXT:    v_cvt_f16_f32_e32 v6, v12
3544; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:8
3545; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
3546; CI-NEXT:    s_waitcnt vmcnt(3)
3547; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3548; CI-NEXT:    s_waitcnt vmcnt(2)
3549; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3550; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
3551; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3552; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3553; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3554; CI-NEXT:    v_or_b32_e32 v6, v7, v6
3555; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3556; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3557; CI-NEXT:    v_cvt_f16_f32_e32 v7, v14
3558; CI-NEXT:    v_cvt_f16_f32_e32 v14, v19
3559; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3560; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3561; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x7c, v0
3562; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3563; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
3564; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
3565; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
3566; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
3567; CI-NEXT:    v_cvt_f16_f32_e32 v19, v26
3568; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
3569; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3570; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
3571; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
3572; CI-NEXT:    v_or_b32_e32 v7, v8, v7
3573; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
3574; CI-NEXT:    v_cvt_f16_f32_e32 v9, v18
3575; CI-NEXT:    v_or_b32_e32 v8, v10, v8
3576; CI-NEXT:    v_cvt_f16_f32_e32 v10, v17
3577; CI-NEXT:    v_cvt_f16_f32_e32 v17, v24
3578; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
3579; CI-NEXT:    v_cvt_f16_f32_e32 v18, v23
3580; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
3581; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
3582; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
3583; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
3584; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
3585; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
3586; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
3587; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
3588; CI-NEXT:    v_or_b32_e32 v9, v10, v9
3589; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
3590; CI-NEXT:    v_cvt_f16_f32_e32 v13, v22
3591; CI-NEXT:    v_or_b32_e32 v10, v14, v10
3592; CI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
3593; CI-NEXT:    v_or_b32_e32 v17, v18, v17
3594; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
3595; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
3596; CI-NEXT:    v_cvt_f16_f32_e32 v22, v27
3597; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
3598; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
3599; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
3600; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
3601; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
3602; CI-NEXT:    v_or_b32_e32 v13, v16, v13
3603; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:12
3604; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
3605; CI-NEXT:    v_or_b32_e32 v19, v20, v19
3606; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
3607; CI-NEXT:    v_cvt_f16_f32_e32 v21, v30
3608; CI-NEXT:    v_or_b32_e32 v20, v22, v20
3609; CI-NEXT:    v_cvt_f16_f32_e32 v22, v29
3610; CI-NEXT:    s_waitcnt vmcnt(6)
3611; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3612; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
3613; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
3614; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
3615; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
3616; CI-NEXT:    s_waitcnt vmcnt(5)
3617; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3618; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
3619; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
3620; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
3621; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
3622; CI-NEXT:    v_or_b32_e32 v21, v22, v21
3623; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
3624; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
3625; CI-NEXT:    s_waitcnt vmcnt(3)
3626; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3627; CI-NEXT:    s_waitcnt vmcnt(2)
3628; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3629; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3630; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3631; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3632; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3633; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3634; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3635; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x78, v0
3636; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3637; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
3638; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
3639; CI-NEXT:    s_waitcnt vmcnt(1)
3640; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3641; CI-NEXT:    s_waitcnt vmcnt(0)
3642; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3643; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3644; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3645; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3646; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3647; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3648; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3649; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x74, v0
3650; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3651; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
3652; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
3653; CI-NEXT:    s_waitcnt vmcnt(1)
3654; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3655; CI-NEXT:    s_waitcnt vmcnt(0)
3656; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3657; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3658; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3659; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3660; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3661; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3662; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3663; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x70, v0
3664; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3665; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
3666; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
3667; CI-NEXT:    s_waitcnt vmcnt(1)
3668; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3669; CI-NEXT:    s_waitcnt vmcnt(0)
3670; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3671; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3672; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3673; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3674; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3675; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3676; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3677; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x6c, v0
3678; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3679; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
3680; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
3681; CI-NEXT:    s_waitcnt vmcnt(1)
3682; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3683; CI-NEXT:    s_waitcnt vmcnt(0)
3684; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3685; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3686; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3687; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3688; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3689; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3690; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3691; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x68, v0
3692; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3693; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
3694; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
3695; CI-NEXT:    s_waitcnt vmcnt(1)
3696; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3697; CI-NEXT:    s_waitcnt vmcnt(0)
3698; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3699; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3700; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3701; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3702; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3703; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3704; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3705; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x64, v0
3706; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3707; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
3708; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
3709; CI-NEXT:    s_waitcnt vmcnt(1)
3710; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3711; CI-NEXT:    s_waitcnt vmcnt(0)
3712; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3713; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3714; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3715; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3716; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3717; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3718; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3719; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x60, v0
3720; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3721; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
3722; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
3723; CI-NEXT:    s_waitcnt vmcnt(1)
3724; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3725; CI-NEXT:    s_waitcnt vmcnt(0)
3726; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3727; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3728; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3729; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3730; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3731; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3732; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3733; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x5c, v0
3734; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3735; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
3736; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
3737; CI-NEXT:    s_waitcnt vmcnt(1)
3738; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3739; CI-NEXT:    s_waitcnt vmcnt(0)
3740; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3741; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3742; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3743; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3744; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3745; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3746; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3747; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x58, v0
3748; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3749; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
3750; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
3751; CI-NEXT:    s_waitcnt vmcnt(1)
3752; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3753; CI-NEXT:    s_waitcnt vmcnt(0)
3754; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3755; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3756; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3757; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3758; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3759; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3760; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3761; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x54, v0
3762; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3763; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
3764; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
3765; CI-NEXT:    s_waitcnt vmcnt(1)
3766; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3767; CI-NEXT:    s_waitcnt vmcnt(0)
3768; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3769; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3770; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3771; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3772; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3773; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3774; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3775; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x50, v0
3776; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3777; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
3778; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
3779; CI-NEXT:    s_waitcnt vmcnt(1)
3780; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3781; CI-NEXT:    s_waitcnt vmcnt(0)
3782; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3783; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3784; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3785; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3786; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3787; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3788; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3789; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x4c, v0
3790; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3791; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
3792; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
3793; CI-NEXT:    s_waitcnt vmcnt(1)
3794; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3795; CI-NEXT:    s_waitcnt vmcnt(0)
3796; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3797; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
3798; CI-NEXT:    v_cvt_f32_f16_e32 v32, v32
3799; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
3800; CI-NEXT:    v_cvt_f16_f32_e32 v32, v32
3801; CI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
3802; CI-NEXT:    v_or_b32_e32 v31, v32, v31
3803; CI-NEXT:    v_add_i32_e32 v32, vcc, 0x48, v0
3804; CI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
3805; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:4
3806; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32
3807; CI-NEXT:    s_waitcnt vmcnt(1)
3808; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3809; CI-NEXT:    s_waitcnt vmcnt(0)
3810; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
3811; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
3812; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
3813; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
3814; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
3815; CI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
3816; CI-NEXT:    v_or_b32_e32 v14, v15, v14
3817; CI-NEXT:    v_cvt_f16_f32_e32 v15, v16
3818; CI-NEXT:    v_cvt_f16_f32_e32 v16, v18
3819; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
3820; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
3821; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
3822; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
3823; CI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
3824; CI-NEXT:    v_or_b32_e32 v12, v12, v15
3825; CI-NEXT:    v_or_b32_e32 v11, v16, v11
3826; CI-NEXT:    v_add_i32_e32 v15, vcc, 0x44, v0
3827; CI-NEXT:    buffer_store_dword v11, v15, s[0:3], 0 offen
3828; CI-NEXT:    v_add_i32_e32 v11, vcc, 64, v0
3829; CI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
3830; CI-NEXT:    v_add_i32_e32 v11, vcc, 60, v0
3831; CI-NEXT:    buffer_store_dword v14, v11, s[0:3], 0 offen
3832; CI-NEXT:    v_add_i32_e32 v11, vcc, 56, v0
3833; CI-NEXT:    buffer_store_dword v21, v11, s[0:3], 0 offen
3834; CI-NEXT:    v_add_i32_e32 v11, vcc, 52, v0
3835; CI-NEXT:    buffer_store_dword v20, v11, s[0:3], 0 offen
3836; CI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
3837; CI-NEXT:    buffer_store_dword v19, v11, s[0:3], 0 offen
3838; CI-NEXT:    v_add_i32_e32 v11, vcc, 44, v0
3839; CI-NEXT:    buffer_store_dword v17, v11, s[0:3], 0 offen
3840; CI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
3841; CI-NEXT:    buffer_store_dword v13, v11, s[0:3], 0 offen
3842; CI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
3843; CI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
3844; CI-NEXT:    v_add_i32_e32 v10, vcc, 32, v0
3845; CI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
3846; CI-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
3847; CI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
3848; CI-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
3849; CI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
3850; CI-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
3851; CI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
3852; CI-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
3853; CI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
3854; CI-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
3855; CI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
3856; CI-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
3857; CI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
3858; CI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
3859; CI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
3860; CI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
3861; CI-NEXT:    s_waitcnt vmcnt(0)
3862; CI-NEXT:    s_setpc_b64 s[30:31]
3863;
3864; GFX11-LABEL: v_test_canonicalize_var_v64f16:
3865; GFX11:       ; %bb.0:
3866; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3867; GFX11-NEXT:    scratch_load_b32 v31, off, s32
3868; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
3869; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
3870; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
3871; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
3872; GFX11-NEXT:    v_pk_max_f16 v4, v4, v4
3873; GFX11-NEXT:    v_pk_max_f16 v5, v5, v5
3874; GFX11-NEXT:    v_pk_max_f16 v6, v6, v6
3875; GFX11-NEXT:    v_pk_max_f16 v7, v7, v7
3876; GFX11-NEXT:    v_pk_max_f16 v8, v8, v8
3877; GFX11-NEXT:    v_pk_max_f16 v9, v9, v9
3878; GFX11-NEXT:    v_pk_max_f16 v10, v10, v10
3879; GFX11-NEXT:    v_pk_max_f16 v11, v11, v11
3880; GFX11-NEXT:    v_pk_max_f16 v12, v12, v12
3881; GFX11-NEXT:    v_pk_max_f16 v13, v13, v13
3882; GFX11-NEXT:    v_pk_max_f16 v14, v14, v14
3883; GFX11-NEXT:    v_pk_max_f16 v15, v15, v15
3884; GFX11-NEXT:    v_pk_max_f16 v16, v16, v16
3885; GFX11-NEXT:    v_pk_max_f16 v17, v17, v17
3886; GFX11-NEXT:    v_pk_max_f16 v18, v18, v18
3887; GFX11-NEXT:    v_pk_max_f16 v19, v19, v19
3888; GFX11-NEXT:    v_pk_max_f16 v20, v20, v20
3889; GFX11-NEXT:    v_pk_max_f16 v21, v21, v21
3890; GFX11-NEXT:    v_pk_max_f16 v22, v22, v22
3891; GFX11-NEXT:    v_pk_max_f16 v23, v23, v23
3892; GFX11-NEXT:    v_pk_max_f16 v24, v24, v24
3893; GFX11-NEXT:    v_pk_max_f16 v25, v25, v25
3894; GFX11-NEXT:    v_pk_max_f16 v26, v26, v26
3895; GFX11-NEXT:    v_pk_max_f16 v27, v27, v27
3896; GFX11-NEXT:    v_pk_max_f16 v28, v28, v28
3897; GFX11-NEXT:    v_pk_max_f16 v29, v29, v29
3898; GFX11-NEXT:    v_pk_max_f16 v30, v30, v30
3899; GFX11-NEXT:    s_waitcnt vmcnt(0)
3900; GFX11-NEXT:    v_pk_max_f16 v31, v31, v31
3901; GFX11-NEXT:    s_setpc_b64 s[30:31]
3902  %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val)
3903  ret <64 x half> %canonicalized
3904}
3905
; Attribute groups referenced by number (#N) from the declarations and test
; functions in this file.
;
; #0: nounwind + readnone. Attached to the intrinsic declarations at the top
; of the file (llvm.fabs.*, llvm.canonicalize.*, llvm.amdgcn.workitem.id.x).
3906attributes #0 = { nounwind readnone }
; #1: nounwind with "denormal-fp-math-f32" set to "preserve-sign,preserve-sign".
; Per the LLVM LangRef the value is an "output,input" pair and the -f32 form
; applies to 32-bit float only -- TODO confirm against the LangRef revision
; this test targets.
3907attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #2: same mode string as #1, but through the unsuffixed "denormal-fp-math"
; key rather than the f32-specific one.
3908attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
; #3: textually identical to #1.
; NOTE(review): looks like a duplicate group; the functions using #3 are not
; visible here, so verify every user before merging it into #1.
3909attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
3910