xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
7
; Intrinsic declarations: fabs and canonicalize for f32/f64 (canonicalize also
; in vector forms), canonicalize for f16 and <2 x half>, and the AMDGPU
; workitem-id query. Attribute group #0 is defined outside this view.
8declare float @llvm.fabs.f32(float) #0
9declare float @llvm.canonicalize.f32(float) #0
10declare <2 x float> @llvm.canonicalize.v2f32(<2 x float>) #0
11declare <3 x float> @llvm.canonicalize.v3f32(<3 x float>) #0
12declare <4 x float> @llvm.canonicalize.v4f32(<4 x float>) #0
13declare <8 x float> @llvm.canonicalize.v8f32(<8 x float>) #0
14declare double @llvm.fabs.f64(double) #0
15declare double @llvm.canonicalize.f64(double) #0
16declare <2 x double> @llvm.canonicalize.v2f64(<2 x double>) #0
17declare <3 x double> @llvm.canonicalize.v3f64(<3 x double>) #0
18declare <4 x double> @llvm.canonicalize.v4f64(<4 x double>) #0
19declare half @llvm.canonicalize.f16(half) #0
20declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
21declare i32 @llvm.amdgcn.workitem.id.x() #0
22
; Canonicalize an f32 that is loaded from %out and store the result back to the
; same address. The expected code is a multiply by 1.0 (v_mul_f32) for the
; hawaii/fiji runs and a max of the value with itself (v_max_f32 on gfx900 and
; gfx1100, v_max_num_f32 on gfx1200).
23define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 {
24; GFX678-LABEL: v_test_canonicalize_var_f32:
25; GFX678:       ; %bb.0:
26; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
27; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
28; GFX678-NEXT:    v_mov_b32_e32 v0, s0
29; GFX678-NEXT:    v_mov_b32_e32 v1, s1
30; GFX678-NEXT:    flat_load_dword v2, v[0:1]
31; GFX678-NEXT:    s_waitcnt vmcnt(0)
32; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
33; GFX678-NEXT:    flat_store_dword v[0:1], v2
34; GFX678-NEXT:    s_endpgm
35;
36; GFX9-LABEL: v_test_canonicalize_var_f32:
37; GFX9:       ; %bb.0:
38; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
39; GFX9-NEXT:    v_mov_b32_e32 v0, 0
40; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
42; GFX9-NEXT:    s_waitcnt vmcnt(0)
43; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
44; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
45; GFX9-NEXT:    s_endpgm
46;
47; GFX11-LABEL: v_test_canonicalize_var_f32:
48; GFX11:       ; %bb.0:
49; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
50; GFX11-NEXT:    v_mov_b32_e32 v0, 0
51; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
53; GFX11-NEXT:    s_waitcnt vmcnt(0)
54; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
55; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
56; GFX11-NEXT:    s_endpgm
57;
58; GFX12-LABEL: v_test_canonicalize_var_f32:
59; GFX12:       ; %bb.0:
60; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
61; GFX12-NEXT:    v_mov_b32_e32 v0, 0
62; GFX12-NEXT:    s_wait_kmcnt 0x0
63; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
64; GFX12-NEXT:    s_wait_loadcnt 0x0
65; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
66; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
67; GFX12-NEXT:    s_endpgm
68  %val = load float, ptr addrspace(1) %out
69  %canonicalized = call float @llvm.canonicalize.f32(float %val)
70  store float %canonicalized, ptr addrspace(1) %out
71  ret void
72}
73
; Canonicalize an f32 passed directly as the kernel argument %val and store it
; to %out. The expected canonicalizing instruction reads the value from scalar
; register s2 (e64 encodings). hawaii and fiji have separate check prefixes
; because the expected s_load_dword offset of the argument differs (0x2 vs 0x8);
; the gfx1200 run expects a single s_load_b96 for both arguments.
74define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 {
75; GFX6-LABEL: s_test_canonicalize_var_f32:
76; GFX6:       ; %bb.0:
77; GFX6-NEXT:    s_load_dword s2, s[8:9], 0x2
78; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
79; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
80; GFX6-NEXT:    v_mul_f32_e64 v2, 1.0, s2
81; GFX6-NEXT:    v_mov_b32_e32 v0, s0
82; GFX6-NEXT:    v_mov_b32_e32 v1, s1
83; GFX6-NEXT:    flat_store_dword v[0:1], v2
84; GFX6-NEXT:    s_endpgm
85;
86; GFX8-LABEL: s_test_canonicalize_var_f32:
87; GFX8:       ; %bb.0:
88; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x8
89; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
90; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
91; GFX8-NEXT:    v_mul_f32_e64 v2, 1.0, s2
92; GFX8-NEXT:    v_mov_b32_e32 v0, s0
93; GFX8-NEXT:    v_mov_b32_e32 v1, s1
94; GFX8-NEXT:    flat_store_dword v[0:1], v2
95; GFX8-NEXT:    s_endpgm
96;
97; GFX9-LABEL: s_test_canonicalize_var_f32:
98; GFX9:       ; %bb.0:
99; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
100; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
101; GFX9-NEXT:    v_mov_b32_e32 v0, 0
102; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
104; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
105; GFX9-NEXT:    s_endpgm
106;
107; GFX11-LABEL: s_test_canonicalize_var_f32:
108; GFX11:       ; %bb.0:
109; GFX11-NEXT:    s_clause 0x1
110; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
111; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
112; GFX11-NEXT:    v_mov_b32_e32 v0, 0
113; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX11-NEXT:    v_max_f32_e64 v1, s2, s2
115; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
116; GFX11-NEXT:    s_endpgm
117;
118; GFX12-LABEL: s_test_canonicalize_var_f32:
119; GFX12:       ; %bb.0:
120; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
121; GFX12-NEXT:    v_mov_b32_e32 v0, 0
122; GFX12-NEXT:    s_wait_kmcnt 0x0
123; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2
124; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
125; GFX12-NEXT:    s_endpgm
126  %canonicalized = call float @llvm.canonicalize.f32(float %val)
127  store float %canonicalized, ptr addrspace(1) %out
128  ret void
129}
130
; Canonicalize fabs(x) for an f32 loaded from %out. The checks expect the
; absolute value to appear as a |...| source modifier on the canonicalizing
; instruction itself (v_mul_f32 1.0, |v2| for hawaii/fiji; a self-max with
; |v1| on both operands for the other runs) rather than as a separate
; instruction.
131define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 {
132; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
133; GFX678:       ; %bb.0:
134; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
135; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX678-NEXT:    v_mov_b32_e32 v0, s0
137; GFX678-NEXT:    v_mov_b32_e32 v1, s1
138; GFX678-NEXT:    flat_load_dword v2, v[0:1]
139; GFX678-NEXT:    s_waitcnt vmcnt(0)
140; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, |v2|
141; GFX678-NEXT:    flat_store_dword v[0:1], v2
142; GFX678-NEXT:    s_endpgm
143;
144; GFX9-LABEL: v_test_canonicalize_fabs_var_f32:
145; GFX9:       ; %bb.0:
146; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
147; GFX9-NEXT:    v_mov_b32_e32 v0, 0
148; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
149; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
150; GFX9-NEXT:    s_waitcnt vmcnt(0)
151; GFX9-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
152; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
153; GFX9-NEXT:    s_endpgm
154;
155; GFX11-LABEL: v_test_canonicalize_fabs_var_f32:
156; GFX11:       ; %bb.0:
157; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
158; GFX11-NEXT:    v_mov_b32_e32 v0, 0
159; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
161; GFX11-NEXT:    s_waitcnt vmcnt(0)
162; GFX11-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
163; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
164; GFX11-NEXT:    s_endpgm
165;
166; GFX12-LABEL: v_test_canonicalize_fabs_var_f32:
167; GFX12:       ; %bb.0:
168; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
169; GFX12-NEXT:    v_mov_b32_e32 v0, 0
170; GFX12-NEXT:    s_wait_kmcnt 0x0
171; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
172; GFX12-NEXT:    s_wait_loadcnt 0x0
173; GFX12-NEXT:    v_max_num_f32_e64 v1, |v1|, |v1|
174; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
175; GFX12-NEXT:    s_endpgm
176  %val = load float, ptr addrspace(1) %out
177  %val.fabs = call float @llvm.fabs.f32(float %val)
178  %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
179  store float %canonicalized, ptr addrspace(1) %out
180  ret void
181}
182
; Canonicalize -fabs(x) for an f32 loaded from %out. For hawaii/fiji the checks
; expect the negation to be carried by the multiplier (v_mul_f32 -1.0, |v2|);
; for the other runs they expect -|v1| source modifiers on both operands of the
; self-max.
183define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 {
184; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
185; GFX678:       ; %bb.0:
186; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
187; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX678-NEXT:    v_mov_b32_e32 v0, s0
189; GFX678-NEXT:    v_mov_b32_e32 v1, s1
190; GFX678-NEXT:    flat_load_dword v2, v[0:1]
191; GFX678-NEXT:    s_waitcnt vmcnt(0)
192; GFX678-NEXT:    v_mul_f32_e64 v2, -1.0, |v2|
193; GFX678-NEXT:    flat_store_dword v[0:1], v2
194; GFX678-NEXT:    s_endpgm
195;
196; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
197; GFX9:       ; %bb.0:
198; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
199; GFX9-NEXT:    v_mov_b32_e32 v0, 0
200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
202; GFX9-NEXT:    s_waitcnt vmcnt(0)
203; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
204; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
205; GFX9-NEXT:    s_endpgm
206;
207; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
208; GFX11:       ; %bb.0:
209; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
210; GFX11-NEXT:    v_mov_b32_e32 v0, 0
211; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
213; GFX11-NEXT:    s_waitcnt vmcnt(0)
214; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
215; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
216; GFX11-NEXT:    s_endpgm
217;
218; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
219; GFX12:       ; %bb.0:
220; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
221; GFX12-NEXT:    v_mov_b32_e32 v0, 0
222; GFX12-NEXT:    s_wait_kmcnt 0x0
223; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
224; GFX12-NEXT:    s_wait_loadcnt 0x0
225; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1|
226; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
227; GFX12-NEXT:    s_endpgm
228  %val = load float, ptr addrspace(1) %out
229  %val.fabs = call float @llvm.fabs.f32(float %val)
230  %val.fabs.fneg = fneg float %val.fabs
231  %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg)
232  store float %canonicalized, ptr addrspace(1) %out
233  ret void
234}
235
; Canonicalize -x for an f32 loaded from %out. For hawaii/fiji the checks expect
; a single multiply by -1.0 (v_mul_f32_e32 -1.0, v2); for the other runs they
; expect a self-max with a negate source modifier on both operands.
236define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 {
237; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
238; GFX678:       ; %bb.0:
239; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
240; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX678-NEXT:    v_mov_b32_e32 v0, s0
242; GFX678-NEXT:    v_mov_b32_e32 v1, s1
243; GFX678-NEXT:    flat_load_dword v2, v[0:1]
244; GFX678-NEXT:    s_waitcnt vmcnt(0)
245; GFX678-NEXT:    v_mul_f32_e32 v2, -1.0, v2
246; GFX678-NEXT:    flat_store_dword v[0:1], v2
247; GFX678-NEXT:    s_endpgm
248;
249; GFX9-LABEL: v_test_canonicalize_fneg_var_f32:
250; GFX9:       ; %bb.0:
251; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
252; GFX9-NEXT:    v_mov_b32_e32 v0, 0
253; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
255; GFX9-NEXT:    s_waitcnt vmcnt(0)
256; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1
257; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
258; GFX9-NEXT:    s_endpgm
259;
260; GFX11-LABEL: v_test_canonicalize_fneg_var_f32:
261; GFX11:       ; %bb.0:
262; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
263; GFX11-NEXT:    v_mov_b32_e32 v0, 0
264; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
266; GFX11-NEXT:    s_waitcnt vmcnt(0)
267; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1
268; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
269; GFX11-NEXT:    s_endpgm
270;
271; GFX12-LABEL: v_test_canonicalize_fneg_var_f32:
272; GFX12:       ; %bb.0:
273; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
274; GFX12-NEXT:    v_mov_b32_e32 v0, 0
275; GFX12-NEXT:    s_wait_kmcnt 0x0
276; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
277; GFX12-NEXT:    s_wait_loadcnt 0x0
278; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1
279; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
280; GFX12-NEXT:    s_endpgm
281  %val = load float, ptr addrspace(1) %out
282  %val.fneg = fneg float %val
283  %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
284  store float %canonicalized, ptr addrspace(1) %out
285  ret void
286}
287
; Canonicalize of an undef f32 operand. The checks expect no canonicalizing
; instruction at all: a constant 0 is stored to %out. On gfx900 and newer the
; expected store reuses the same zeroed register v0 as both the address offset
; and the stored data.
288define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 {
289; GFX678-LABEL: test_fold_canonicalize_undef_f32:
290; GFX678:       ; %bb.0:
291; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
292; GFX678-NEXT:    v_mov_b32_e32 v2, 0
293; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX678-NEXT:    v_mov_b32_e32 v0, s0
295; GFX678-NEXT:    v_mov_b32_e32 v1, s1
296; GFX678-NEXT:    flat_store_dword v[0:1], v2
297; GFX678-NEXT:    s_endpgm
298;
299; GFX9-LABEL: test_fold_canonicalize_undef_f32:
300; GFX9:       ; %bb.0:
301; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
302; GFX9-NEXT:    v_mov_b32_e32 v0, 0
303; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
305; GFX9-NEXT:    s_endpgm
306;
307; GFX11-LABEL: test_fold_canonicalize_undef_f32:
308; GFX11:       ; %bb.0:
309; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
310; GFX11-NEXT:    v_mov_b32_e32 v0, 0
311; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
313; GFX11-NEXT:    s_endpgm
314;
315; GFX12-LABEL: test_fold_canonicalize_undef_f32:
316; GFX12:       ; %bb.0:
317; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
318; GFX12-NEXT:    v_mov_b32_e32 v0, 0
319; GFX12-NEXT:    s_wait_kmcnt 0x0
320; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
321; GFX12-NEXT:    s_endpgm
322  %canonicalized = call float @llvm.canonicalize.f32(float undef)
323  store float %canonicalized, ptr addrspace(1) %out
324  ret void
325}
326
; Canonicalize of the constant +0.0. The checks expect the call to be folded
; away: a constant 0 is moved into a register and stored to %out with no
; v_mul/v_max instruction.
327define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 {
328; GFX678-LABEL: test_fold_canonicalize_p0_f32:
329; GFX678:       ; %bb.0:
330; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
331; GFX678-NEXT:    v_mov_b32_e32 v2, 0
332; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX678-NEXT:    v_mov_b32_e32 v0, s0
334; GFX678-NEXT:    v_mov_b32_e32 v1, s1
335; GFX678-NEXT:    flat_store_dword v[0:1], v2
336; GFX678-NEXT:    s_endpgm
337;
338; GFX9-LABEL: test_fold_canonicalize_p0_f32:
339; GFX9:       ; %bb.0:
340; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
341; GFX9-NEXT:    v_mov_b32_e32 v0, 0
342; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
343; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
344; GFX9-NEXT:    s_endpgm
345;
346; GFX11-LABEL: test_fold_canonicalize_p0_f32:
347; GFX11:       ; %bb.0:
348; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
349; GFX11-NEXT:    v_mov_b32_e32 v0, 0
350; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
352; GFX11-NEXT:    s_endpgm
353;
354; GFX12-LABEL: test_fold_canonicalize_p0_f32:
355; GFX12:       ; %bb.0:
356; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
357; GFX12-NEXT:    v_mov_b32_e32 v0, 0
358; GFX12-NEXT:    s_wait_kmcnt 0x0
359; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
360; GFX12-NEXT:    s_endpgm
361  %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
362  store float %canonicalized, ptr addrspace(1) %out
363  ret void
364}
365
; Canonicalize of the constant -0.0. The checks expect the call to be folded to
; a constant that is materialized with v_bfrev_b32 of 1 (bit-reverse of 1, i.e.
; only the sign bit set) and stored to %out; no v_mul/v_max is expected.
366define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 {
367; GFX678-LABEL: test_fold_canonicalize_n0_f32:
368; GFX678:       ; %bb.0:
369; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
370; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
371; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
372; GFX678-NEXT:    v_mov_b32_e32 v0, s0
373; GFX678-NEXT:    v_mov_b32_e32 v1, s1
374; GFX678-NEXT:    flat_store_dword v[0:1], v2
375; GFX678-NEXT:    s_endpgm
376;
377; GFX9-LABEL: test_fold_canonicalize_n0_f32:
378; GFX9:       ; %bb.0:
379; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
380; GFX9-NEXT:    v_mov_b32_e32 v0, 0
381; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
384; GFX9-NEXT:    s_endpgm
385;
386; GFX11-LABEL: test_fold_canonicalize_n0_f32:
387; GFX11:       ; %bb.0:
388; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
389; GFX11-NEXT:    v_mov_b32_e32 v0, 0
390; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
391; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
393; GFX11-NEXT:    s_endpgm
394;
395; GFX12-LABEL: test_fold_canonicalize_n0_f32:
396; GFX12:       ; %bb.0:
397; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
398; GFX12-NEXT:    v_mov_b32_e32 v0, 0
399; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
400; GFX12-NEXT:    s_wait_kmcnt 0x0
401; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
402; GFX12-NEXT:    s_endpgm
403  %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
404  store float %canonicalized, ptr addrspace(1) %out
405  ret void
406}
407
; Canonicalize of the constant 1.0. The checks expect the call to be folded to
; a plain move of the 1.0 constant followed by the store. The gfx1100 and
; gfx1200 runs expect the zero offset and the 1.0 to be set up by one
; v_dual_mov_b32 pair.
408define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 {
409; GFX678-LABEL: test_fold_canonicalize_p1_f32:
410; GFX678:       ; %bb.0:
411; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
412; GFX678-NEXT:    v_mov_b32_e32 v2, 1.0
413; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX678-NEXT:    v_mov_b32_e32 v0, s0
415; GFX678-NEXT:    v_mov_b32_e32 v1, s1
416; GFX678-NEXT:    flat_store_dword v[0:1], v2
417; GFX678-NEXT:    s_endpgm
418;
419; GFX9-LABEL: test_fold_canonicalize_p1_f32:
420; GFX9:       ; %bb.0:
421; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
422; GFX9-NEXT:    v_mov_b32_e32 v0, 0
423; GFX9-NEXT:    v_mov_b32_e32 v1, 1.0
424; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
425; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
426; GFX9-NEXT:    s_endpgm
427;
428; GFX11-LABEL: test_fold_canonicalize_p1_f32:
429; GFX11:       ; %bb.0:
430; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
431; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
432; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
434; GFX11-NEXT:    s_endpgm
435;
436; GFX12-LABEL: test_fold_canonicalize_p1_f32:
437; GFX12:       ; %bb.0:
438; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
439; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
440; GFX12-NEXT:    s_wait_kmcnt 0x0
441; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
442; GFX12-NEXT:    s_endpgm
443  %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
444  store float %canonicalized, ptr addrspace(1) %out
445  ret void
446}
447
; Canonicalize of the constant -1.0. The checks expect the call to be folded to
; a plain move of the -1.0 constant followed by the store, with no v_mul/v_max
; instruction in any run.
448define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 {
449; GFX678-LABEL: test_fold_canonicalize_n1_f32:
450; GFX678:       ; %bb.0:
451; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
452; GFX678-NEXT:    v_mov_b32_e32 v2, -1.0
453; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX678-NEXT:    v_mov_b32_e32 v0, s0
455; GFX678-NEXT:    v_mov_b32_e32 v1, s1
456; GFX678-NEXT:    flat_store_dword v[0:1], v2
457; GFX678-NEXT:    s_endpgm
458;
459; GFX9-LABEL: test_fold_canonicalize_n1_f32:
460; GFX9:       ; %bb.0:
461; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
462; GFX9-NEXT:    v_mov_b32_e32 v0, 0
463; GFX9-NEXT:    v_mov_b32_e32 v1, -1.0
464; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
466; GFX9-NEXT:    s_endpgm
467;
468; GFX11-LABEL: test_fold_canonicalize_n1_f32:
469; GFX11:       ; %bb.0:
470; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
471; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
472; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
474; GFX11-NEXT:    s_endpgm
475;
476; GFX12-LABEL: test_fold_canonicalize_n1_f32:
477; GFX12:       ; %bb.0:
478; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
479; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
480; GFX12-NEXT:    s_wait_kmcnt 0x0
481; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
482; GFX12-NEXT:    s_endpgm
483  %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
484  store float %canonicalized, ptr addrspace(1) %out
485  ret void
486}
487
; Canonicalize of the constant 16.0. The checks expect the call to be folded to
; a move of the 32-bit literal 0x41800000 (the bit pattern of 16.0f) followed
; by the store, with no v_mul/v_max instruction.
488define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 {
489; GFX678-LABEL: test_fold_canonicalize_literal_f32:
490; GFX678:       ; %bb.0:
491; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
492; GFX678-NEXT:    v_mov_b32_e32 v2, 0x41800000
493; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
494; GFX678-NEXT:    v_mov_b32_e32 v0, s0
495; GFX678-NEXT:    v_mov_b32_e32 v1, s1
496; GFX678-NEXT:    flat_store_dword v[0:1], v2
497; GFX678-NEXT:    s_endpgm
498;
499; GFX9-LABEL: test_fold_canonicalize_literal_f32:
500; GFX9:       ; %bb.0:
501; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
502; GFX9-NEXT:    v_mov_b32_e32 v0, 0
503; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41800000
504; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
506; GFX9-NEXT:    s_endpgm
507;
508; GFX11-LABEL: test_fold_canonicalize_literal_f32:
509; GFX11:       ; %bb.0:
510; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
511; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
512; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
513; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
514; GFX11-NEXT:    s_endpgm
515;
516; GFX12-LABEL: test_fold_canonicalize_literal_f32:
517; GFX12:       ; %bb.0:
518; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
519; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
520; GFX12-NEXT:    s_wait_kmcnt 0x0
521; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
522; GFX12-NEXT:    s_endpgm
523  %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
524  store float %canonicalized, ptr addrspace(1) %out
525  ret void
526}
527
; Canonicalize of the constant with bit pattern 8388607 (0x7fffff: exponent
; bits all zero, mantissa bits all one, i.e. a positive f32 denormal). The
; checks expect the call to be folded to a stored constant 0.
; Attribute group #1 is defined outside this view; going by the test name it
; presumably selects a mode without f32 denormal support (confirm against the
; attribute definitions).
528define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 {
529; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
530; GFX678:       ; %bb.0:
531; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
532; GFX678-NEXT:    v_mov_b32_e32 v2, 0
533; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX678-NEXT:    v_mov_b32_e32 v0, s0
535; GFX678-NEXT:    v_mov_b32_e32 v1, s1
536; GFX678-NEXT:    flat_store_dword v[0:1], v2
537; GFX678-NEXT:    s_endpgm
538;
539; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
540; GFX9:       ; %bb.0:
541; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
542; GFX9-NEXT:    v_mov_b32_e32 v0, 0
543; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
544; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
545; GFX9-NEXT:    s_endpgm
546;
547; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
548; GFX11:       ; %bb.0:
549; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
550; GFX11-NEXT:    v_mov_b32_e32 v0, 0
551; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
553; GFX11-NEXT:    s_endpgm
554;
555; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
556; GFX12:       ; %bb.0:
557; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
558; GFX12-NEXT:    v_mov_b32_e32 v0, 0
559; GFX12-NEXT:    s_wait_kmcnt 0x0
560; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
561; GFX12-NEXT:    s_endpgm
562  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
563  store float %canonicalized, ptr addrspace(1) %out
564  ret void
565}
566
; Same denormal constant (bit pattern 8388607 = 0x7fffff) as the non-dynamic
; variant, but under attribute group #5. Here the checks expect the
; canonicalize to stay as a real instruction operating on 0x7fffff (v_mul_f32
; by 1.0 for hawaii/fiji, a self-max for the other runs) instead of being
; folded to a constant. That instruction is expected before the wait on the
; kernel-argument load.
; Attribute group #5 is defined outside this view; going by the test name it
; presumably requests a dynamic denormal mode (confirm against the attribute
; definitions).
567define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 {
568; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
569; GFX678:       ; %bb.0:
570; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
571; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
572; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
573; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX678-NEXT:    v_mov_b32_e32 v0, s0
575; GFX678-NEXT:    v_mov_b32_e32 v1, s1
576; GFX678-NEXT:    flat_store_dword v[0:1], v2
577; GFX678-NEXT:    s_endpgm
578;
579; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
580; GFX9:       ; %bb.0:
581; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
582; GFX9-NEXT:    s_mov_b32 s2, 0x7fffff
583; GFX9-NEXT:    v_mov_b32_e32 v0, 0
584; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
585; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
587; GFX9-NEXT:    s_endpgm
588;
589; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
590; GFX11:       ; %bb.0:
591; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
592; GFX11-NEXT:    v_mov_b32_e32 v0, 0
593; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
594; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
596; GFX11-NEXT:    s_endpgm
597;
598; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
599; GFX12:       ; %bb.0:
600; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
601; GFX12-NEXT:    v_mov_b32_e32 v0, 0
602; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
603; GFX12-NEXT:    s_wait_kmcnt 0x0
604; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
605; GFX12-NEXT:    s_endpgm
606  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
607  store float %canonicalized, ptr addrspace(1) %out
608  ret void
609}
610
; Canonicalize of the denormal constant with bit pattern 8388607 (0x7fffff)
; under attribute group #6. The checks expect the canonicalize to remain as a
; real instruction on 0x7fffff (v_mul_f32 by 1.0 for hawaii/fiji, a self-max
; for the other runs) rather than being folded to a constant.
; Attribute group #6 is defined outside this view; going by the "_dynamic_out"
; suffix it presumably makes only the output denormal mode dynamic (confirm
; against the attribute definitions).
611define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 {
612; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
613; GFX678:       ; %bb.0:
614; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
615; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
616; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
617; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
618; GFX678-NEXT:    v_mov_b32_e32 v0, s0
619; GFX678-NEXT:    v_mov_b32_e32 v1, s1
620; GFX678-NEXT:    flat_store_dword v[0:1], v2
621; GFX678-NEXT:    s_endpgm
622;
623; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
624; GFX9:       ; %bb.0:
625; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
626; GFX9-NEXT:    s_mov_b32 s2, 0x7fffff
627; GFX9-NEXT:    v_mov_b32_e32 v0, 0
628; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
629; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
631; GFX9-NEXT:    s_endpgm
632;
633; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
634; GFX11:       ; %bb.0:
635; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
636; GFX11-NEXT:    v_mov_b32_e32 v0, 0
637; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
638; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
639; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
640; GFX11-NEXT:    s_endpgm
641;
642; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
643; GFX12:       ; %bb.0:
644; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
645; GFX12-NEXT:    v_mov_b32_e32 v0, 0
646; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
647; GFX12-NEXT:    s_wait_kmcnt 0x0
648; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
649; GFX12-NEXT:    s_endpgm
650  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
651  store float %canonicalized, ptr addrspace(1) %out
652  ret void
653}
654
; Canonicalize of the denormal constant with bit pattern 8388607 (0x7fffff)
; under attribute group #7. The checks expect the canonicalize to remain as a
; real instruction on 0x7fffff (v_mul_f32 by 1.0 for hawaii/fiji, a self-max
; for the other runs) rather than being folded to a constant.
; Attribute group #7 is defined outside this view; going by the "_dynamic_in"
; suffix it presumably makes only the input denormal mode dynamic (confirm
; against the attribute definitions).
655define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 {
656; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
657; GFX678:       ; %bb.0:
658; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
659; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
660; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
661; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX678-NEXT:    v_mov_b32_e32 v0, s0
663; GFX678-NEXT:    v_mov_b32_e32 v1, s1
664; GFX678-NEXT:    flat_store_dword v[0:1], v2
665; GFX678-NEXT:    s_endpgm
666;
667; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
668; GFX9:       ; %bb.0:
669; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
670; GFX9-NEXT:    s_mov_b32 s2, 0x7fffff
671; GFX9-NEXT:    v_mov_b32_e32 v0, 0
672; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
673; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
675; GFX9-NEXT:    s_endpgm
676;
677; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
678; GFX11:       ; %bb.0:
679; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
680; GFX11-NEXT:    v_mov_b32_e32 v0, 0
681; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
682; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
684; GFX11-NEXT:    s_endpgm
685;
686; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
687; GFX12:       ; %bb.0:
688; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
689; GFX12-NEXT:    v_mov_b32_e32 v0, 0
690; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
691; GFX12-NEXT:    s_wait_kmcnt 0x0
692; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
693; GFX12-NEXT:    s_endpgm
694  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
695  store float %canonicalized, ptr addrspace(1) %out
696  ret void
697}
698
; Canonicalize of the denormal constant with bit pattern 8388607 (0x7fffff)
; under attribute group #3. The checks expect the call to be folded to a move
; of the unchanged literal 0x7fffff followed by the store, i.e. the denormal
; value is preserved and no v_mul/v_max is emitted.
; Attribute group #3 is defined outside this view; going by the test name it
; presumably enables f32 denormals (confirm against the attribute definitions).
699define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 {
700; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
701; GFX678:       ; %bb.0:
702; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
703; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fffff
704; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX678-NEXT:    v_mov_b32_e32 v0, s0
706; GFX678-NEXT:    v_mov_b32_e32 v1, s1
707; GFX678-NEXT:    flat_store_dword v[0:1], v2
708; GFX678-NEXT:    s_endpgm
709;
710; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
711; GFX9:       ; %bb.0:
712; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
713; GFX9-NEXT:    v_mov_b32_e32 v0, 0
714; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fffff
715; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
717; GFX9-NEXT:    s_endpgm
718;
719; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
720; GFX11:       ; %bb.0:
721; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
722; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
723; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
725; GFX11-NEXT:    s_endpgm
726;
727; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
728; GFX12:       ; %bb.0:
729; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
730; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
731; GFX12-NEXT:    s_wait_kmcnt 0x0
732; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
733; GFX12-NEXT:    s_endpgm
734  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
735  store float %canonicalized, ptr addrspace(1) %out
736  ret void
737}
738
; Canonicalize of the constant with bit pattern 2155872255 (0x807fffff: sign
; bit set, exponent bits all zero, mantissa bits all one, i.e. a negative f32
; denormal). The checks expect the call to be folded to a constant materialized
; with v_bfrev_b32 of 1 (only the sign bit set, the bit pattern of -0.0) and
; stored to %out.
; Attribute group #1 is defined outside this view; going by the test name it
; presumably selects a mode without f32 denormal support (confirm against the
; attribute definitions).
739define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 {
740; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
741; GFX678:       ; %bb.0:
742; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
743; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
744; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX678-NEXT:    v_mov_b32_e32 v0, s0
746; GFX678-NEXT:    v_mov_b32_e32 v1, s1
747; GFX678-NEXT:    flat_store_dword v[0:1], v2
748; GFX678-NEXT:    s_endpgm
749;
750; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
751; GFX9:       ; %bb.0:
752; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
753; GFX9-NEXT:    v_mov_b32_e32 v0, 0
754; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
755; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
756; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
757; GFX9-NEXT:    s_endpgm
758;
759; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
760; GFX11:       ; %bb.0:
761; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
762; GFX11-NEXT:    v_mov_b32_e32 v0, 0
763; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
764; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
766; GFX11-NEXT:    s_endpgm
767;
768; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
769; GFX12:       ; %bb.0:
770; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
771; GFX12-NEXT:    v_mov_b32_e32 v0, 0
772; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
773; GFX12-NEXT:    s_wait_kmcnt 0x0
774; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
775; GFX12-NEXT:    s_endpgm
776  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
777  store float %canonicalized, ptr addrspace(1) %out
778  ret void
779}
780
; Constant-fold test: canonicalize of the f32 bit pattern 0x807FFFFF
; (i32 2155872255, a negative denormal). Every run expects the literal
; 0x807fffff to be stored unchanged, i.e. the denormal is preserved by the fold.
; NOTE(review): attribute group #3 is defined outside this chunk; presumably it
; enables f32 denormals, as the test name suggests -- confirm.
781define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 {
782; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
783; GFX678:       ; %bb.0:
784; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
785; GFX678-NEXT:    v_mov_b32_e32 v2, 0x807fffff
786; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX678-NEXT:    v_mov_b32_e32 v0, s0
788; GFX678-NEXT:    v_mov_b32_e32 v1, s1
789; GFX678-NEXT:    flat_store_dword v[0:1], v2
790; GFX678-NEXT:    s_endpgm
791;
792; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
793; GFX9:       ; %bb.0:
794; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
795; GFX9-NEXT:    v_mov_b32_e32 v0, 0
796; GFX9-NEXT:    v_mov_b32_e32 v1, 0x807fffff
797; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
798; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
799; GFX9-NEXT:    s_endpgm
800;
801; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
802; GFX11:       ; %bb.0:
803; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
804; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
805; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
807; GFX11-NEXT:    s_endpgm
808;
809; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
810; GFX12:       ; %bb.0:
811; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
812; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
813; GFX12-NEXT:    s_wait_kmcnt 0x0
814; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
815; GFX12-NEXT:    s_endpgm
816  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
817  store float %canonicalized, ptr addrspace(1) %out
818  ret void
819}
820
; Constant-fold test: canonicalize of a quiet NaN constant (written in LLVM's
; hexadecimal double notation, 0x7FF8000000000000). Every run expects the f32
; bit pattern 0x7fc00000 to be stored, with no canonicalize instruction left.
821define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 {
822; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
823; GFX678:       ; %bb.0:
824; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
825; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
826; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX678-NEXT:    v_mov_b32_e32 v0, s0
828; GFX678-NEXT:    v_mov_b32_e32 v1, s1
829; GFX678-NEXT:    flat_store_dword v[0:1], v2
830; GFX678-NEXT:    s_endpgm
831;
832; GFX9-LABEL: test_fold_canonicalize_qnan_f32:
833; GFX9:       ; %bb.0:
834; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
835; GFX9-NEXT:    v_mov_b32_e32 v0, 0
836; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
837; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
838; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
839; GFX9-NEXT:    s_endpgm
840;
841; GFX11-LABEL: test_fold_canonicalize_qnan_f32:
842; GFX11:       ; %bb.0:
843; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
844; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
845; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
847; GFX11-NEXT:    s_endpgm
848;
849; GFX12-LABEL: test_fold_canonicalize_qnan_f32:
850; GFX12:       ; %bb.0:
851; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
852; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
853; GFX12-NEXT:    s_wait_kmcnt 0x0
854; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
855; GFX12-NEXT:    s_endpgm
856  %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
857  store float %canonicalized, ptr addrspace(1) %out
858  ret void
859}
860
; Constant-fold test: canonicalize of the f32 bit pattern 0xFFFFFFFF (i32 -1),
; a NaN with the sign bit and every mantissa bit set. Every run expects
; 0x7fc00000 to be stored, so neither the sign nor the payload of the input
; survives the fold.
861define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 {
862; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
863; GFX678:       ; %bb.0:
864; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
865; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
866; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX678-NEXT:    v_mov_b32_e32 v0, s0
868; GFX678-NEXT:    v_mov_b32_e32 v1, s1
869; GFX678-NEXT:    flat_store_dword v[0:1], v2
870; GFX678-NEXT:    s_endpgm
871;
872; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
873; GFX9:       ; %bb.0:
874; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
875; GFX9-NEXT:    v_mov_b32_e32 v0, 0
876; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
877; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
879; GFX9-NEXT:    s_endpgm
880;
881; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
882; GFX11:       ; %bb.0:
883; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
884; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
885; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
887; GFX11-NEXT:    s_endpgm
888;
889; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
890; GFX12:       ; %bb.0:
891; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
892; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
893; GFX12-NEXT:    s_wait_kmcnt 0x0
894; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
895; GFX12-NEXT:    s_endpgm
896  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
897  store float %canonicalized, ptr addrspace(1) %out
898  ret void
899}
900
; Constant-fold test: canonicalize of the f32 bit pattern 0xFFFFFFFE (i32 -2),
; a negative NaN whose quiet bit is set and whose lowest payload bit is clear.
; Every run expects 0x7fc00000 to be stored, so the sign and payload of the
; input are dropped by the fold.
901define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 {
902; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
903; GFX678:       ; %bb.0:
904; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
905; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
906; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX678-NEXT:    v_mov_b32_e32 v0, s0
908; GFX678-NEXT:    v_mov_b32_e32 v1, s1
909; GFX678-NEXT:    flat_store_dword v[0:1], v2
910; GFX678-NEXT:    s_endpgm
911;
912; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
913; GFX9:       ; %bb.0:
914; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
915; GFX9-NEXT:    v_mov_b32_e32 v0, 0
916; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
917; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
919; GFX9-NEXT:    s_endpgm
920;
921; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
922; GFX11:       ; %bb.0:
923; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
924; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
925; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
926; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
927; GFX11-NEXT:    s_endpgm
928;
929; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
930; GFX12:       ; %bb.0:
931; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
932; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
933; GFX12-NEXT:    s_wait_kmcnt 0x0
934; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
935; GFX12-NEXT:    s_endpgm
936  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
937  store float %canonicalized, ptr addrspace(1) %out
938  ret void
939}
940
; Constant-fold test: canonicalize of the f32 bit pattern 0x7F800001
; (i32 2139095041), a positive signaling NaN with the smallest non-zero
; payload. Every run expects the quiet NaN pattern 0x7fc00000 to be stored.
941define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 {
942; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
943; GFX678:       ; %bb.0:
944; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
945; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
946; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
947; GFX678-NEXT:    v_mov_b32_e32 v0, s0
948; GFX678-NEXT:    v_mov_b32_e32 v1, s1
949; GFX678-NEXT:    flat_store_dword v[0:1], v2
950; GFX678-NEXT:    s_endpgm
951;
952; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32:
953; GFX9:       ; %bb.0:
954; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
955; GFX9-NEXT:    v_mov_b32_e32 v0, 0
956; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
957; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
959; GFX9-NEXT:    s_endpgm
960;
961; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32:
962; GFX11:       ; %bb.0:
963; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
964; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
965; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
966; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
967; GFX11-NEXT:    s_endpgm
968;
969; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32:
970; GFX12:       ; %bb.0:
971; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
972; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
973; GFX12-NEXT:    s_wait_kmcnt 0x0
974; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
975; GFX12-NEXT:    s_endpgm
976  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
977  store float %canonicalized, ptr addrspace(1) %out
978  ret void
979}
980
; Constant-fold test: canonicalize of the f32 bit pattern 0x7FBFFFFF
; (i32 2143289343), a positive signaling NaN with every payload bit below the
; quiet bit set. Every run expects the quiet NaN pattern 0x7fc00000 to be
; stored.
981define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 {
982; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
983; GFX678:       ; %bb.0:
984; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
985; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
986; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX678-NEXT:    v_mov_b32_e32 v0, s0
988; GFX678-NEXT:    v_mov_b32_e32 v1, s1
989; GFX678-NEXT:    flat_store_dword v[0:1], v2
990; GFX678-NEXT:    s_endpgm
991;
992; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32:
993; GFX9:       ; %bb.0:
994; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
995; GFX9-NEXT:    v_mov_b32_e32 v0, 0
996; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
998; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
999; GFX9-NEXT:    s_endpgm
1000;
1001; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32:
1002; GFX11:       ; %bb.0:
1003; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1004; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
1005; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1007; GFX11-NEXT:    s_endpgm
1008;
1009; GFX12-LABEL: test_fold_canonicalize_snan1_value_f32:
1010; GFX12:       ; %bb.0:
1011; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1012; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
1013; GFX12-NEXT:    s_wait_kmcnt 0x0
1014; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1015; GFX12-NEXT:    s_endpgm
1016  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
1017  store float %canonicalized, ptr addrspace(1) %out
1018  ret void
1019}
1020
; Constant-fold test: canonicalize of the f32 bit pattern 0xFF800001
; (i32 4286578689), a negative signaling NaN with the smallest non-zero
; payload. Every run expects the positive quiet NaN pattern 0x7fc00000 to be
; stored, so the input sign is not kept.
1021define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 {
1022; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
1023; GFX678:       ; %bb.0:
1024; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1025; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
1026; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1027; GFX678-NEXT:    v_mov_b32_e32 v0, s0
1028; GFX678-NEXT:    v_mov_b32_e32 v1, s1
1029; GFX678-NEXT:    flat_store_dword v[0:1], v2
1030; GFX678-NEXT:    s_endpgm
1031;
1032; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32:
1033; GFX9:       ; %bb.0:
1034; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1035; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1036; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
1037; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1038; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1039; GFX9-NEXT:    s_endpgm
1040;
1041; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32:
1042; GFX11:       ; %bb.0:
1043; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1044; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
1045; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1046; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1047; GFX11-NEXT:    s_endpgm
1048;
1049; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32:
1050; GFX12:       ; %bb.0:
1051; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1052; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
1053; GFX12-NEXT:    s_wait_kmcnt 0x0
1054; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1055; GFX12-NEXT:    s_endpgm
1056  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
1057  store float %canonicalized, ptr addrspace(1) %out
1058  ret void
1059}
1060
; Constant-fold test: canonicalize of the f32 bit pattern 0xFFBFFFFF
; (i32 4290772991), a negative signaling NaN with every payload bit below the
; quiet bit set. Every run expects the positive quiet NaN pattern 0x7fc00000 to
; be stored, so the input sign is not kept.
1061define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 {
1062; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
1063; GFX678:       ; %bb.0:
1064; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1065; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
1066; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1067; GFX678-NEXT:    v_mov_b32_e32 v0, s0
1068; GFX678-NEXT:    v_mov_b32_e32 v1, s1
1069; GFX678-NEXT:    flat_store_dword v[0:1], v2
1070; GFX678-NEXT:    s_endpgm
1071;
1072; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32:
1073; GFX9:       ; %bb.0:
1074; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1075; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1076; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
1077; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1078; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1079; GFX9-NEXT:    s_endpgm
1080;
1081; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32:
1082; GFX11:       ; %bb.0:
1083; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1084; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
1085; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1087; GFX11-NEXT:    s_endpgm
1088;
1089; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32:
1090; GFX12:       ; %bb.0:
1091; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1092; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
1093; GFX12-NEXT:    s_wait_kmcnt 0x0
1094; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1095; GFX12-NEXT:    s_endpgm
1096  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
1097  store float %canonicalized, ptr addrspace(1) %out
1098  ret void
1099}
1100
; Non-constant f64 case: loads a double from %out, canonicalizes it and stores
; it back to the same address. Every run expects the canonicalize to be
; selected as a max of the loaded value with itself (v_max_f64, or
; v_max_num_f64 in the gfx1200 run).
1101define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 {
1102; GFX678-LABEL: v_test_canonicalize_var_f64:
1103; GFX678:       ; %bb.0:
1104; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1105; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX678-NEXT:    v_mov_b32_e32 v0, s0
1107; GFX678-NEXT:    v_mov_b32_e32 v1, s1
1108; GFX678-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
1109; GFX678-NEXT:    s_waitcnt vmcnt(0)
1110; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
1111; GFX678-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1112; GFX678-NEXT:    s_endpgm
1113;
1114; GFX9-LABEL: v_test_canonicalize_var_f64:
1115; GFX9:       ; %bb.0:
1116; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1117; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
1120; GFX9-NEXT:    s_waitcnt vmcnt(0)
1121; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
1122; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1123; GFX9-NEXT:    s_endpgm
1124;
1125; GFX11-LABEL: v_test_canonicalize_var_f64:
1126; GFX11:       ; %bb.0:
1127; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1128; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1129; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1131; GFX11-NEXT:    s_waitcnt vmcnt(0)
1132; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
1133; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1134; GFX11-NEXT:    s_endpgm
1135;
1136; GFX12-LABEL: v_test_canonicalize_var_f64:
1137; GFX12:       ; %bb.0:
1138; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1139; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1140; GFX12-NEXT:    s_wait_kmcnt 0x0
1141; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1142; GFX12-NEXT:    s_wait_loadcnt 0x0
1143; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
1144; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1145; GFX12-NEXT:    s_endpgm
1146  %val = load double, ptr addrspace(1) %out
1147  %canonicalized = call double @llvm.canonicalize.f64(double %val)
1148  store double %canonicalized, ptr addrspace(1) %out
1149  ret void
1150}
1151
; Non-constant f64 case where the operand is the kernel argument %val rather
; than a loaded value. Every run expects a max applied directly to the scalar
; register pair s[2:3] holding the argument. The hawaii and fiji runs are
; checked under separate prefixes here because their expected VGPR assignments
; differ.
1152define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 {
1153; GFX6-LABEL: s_test_canonicalize_var_f64:
1154; GFX6:       ; %bb.0:
1155; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1156; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1157; GFX6-NEXT:    v_max_f64 v[2:3], s[2:3], s[2:3]
1158; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1159; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1160; GFX6-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1161; GFX6-NEXT:    s_endpgm
1162;
1163; GFX8-LABEL: s_test_canonicalize_var_f64:
1164; GFX8:       ; %bb.0:
1165; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1166; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX8-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
1168; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1169; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1170; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1171; GFX8-NEXT:    s_endpgm
1172;
1173; GFX9-LABEL: s_test_canonicalize_var_f64:
1174; GFX9:       ; %bb.0:
1175; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1176; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1177; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1178; GFX9-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
1179; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1180; GFX9-NEXT:    s_endpgm
1181;
1182; GFX11-LABEL: s_test_canonicalize_var_f64:
1183; GFX11:       ; %bb.0:
1184; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1185; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1186; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX11-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
1188; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1189; GFX11-NEXT:    s_endpgm
1190;
1191; GFX12-LABEL: s_test_canonicalize_var_f64:
1192; GFX12:       ; %bb.0:
1193; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1194; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1195; GFX12-NEXT:    s_wait_kmcnt 0x0
1196; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], s[2:3], s[2:3]
1197; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1198; GFX12-NEXT:    s_endpgm
1199  %canonicalized = call double @llvm.canonicalize.f64(double %val)
1200  store double %canonicalized, ptr addrspace(1) %out
1201  ret void
1202}
1203
; Canonicalize of fabs(x) for a loaded double. Every run expects the fabs to be
; folded into the max as an absolute-value source modifier on both operands
; (|v[..]|, |v[..]|) instead of being emitted as a separate instruction.
1204define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 {
1205; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
1206; GFX678:       ; %bb.0:
1207; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1208; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX678-NEXT:    v_mov_b32_e32 v0, s0
1210; GFX678-NEXT:    v_mov_b32_e32 v1, s1
1211; GFX678-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
1212; GFX678-NEXT:    s_waitcnt vmcnt(0)
1213; GFX678-NEXT:    v_max_f64 v[2:3], |v[2:3]|, |v[2:3]|
1214; GFX678-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1215; GFX678-NEXT:    s_endpgm
1216;
1217; GFX9-LABEL: v_test_canonicalize_fabs_var_f64:
1218; GFX9:       ; %bb.0:
1219; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1220; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1221; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
1223; GFX9-NEXT:    s_waitcnt vmcnt(0)
1224; GFX9-NEXT:    v_max_f64 v[0:1], |v[0:1]|, |v[0:1]|
1225; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1226; GFX9-NEXT:    s_endpgm
1227;
1228; GFX11-LABEL: v_test_canonicalize_fabs_var_f64:
1229; GFX11:       ; %bb.0:
1230; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1231; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1232; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1233; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1234; GFX11-NEXT:    s_waitcnt vmcnt(0)
1235; GFX11-NEXT:    v_max_f64 v[0:1], |v[0:1]|, |v[0:1]|
1236; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1237; GFX11-NEXT:    s_endpgm
1238;
1239; GFX12-LABEL: v_test_canonicalize_fabs_var_f64:
1240; GFX12:       ; %bb.0:
1241; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1242; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1243; GFX12-NEXT:    s_wait_kmcnt 0x0
1244; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1245; GFX12-NEXT:    s_wait_loadcnt 0x0
1246; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], |v[0:1]|, |v[0:1]|
1247; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1248; GFX12-NEXT:    s_endpgm
1249  %val = load double, ptr addrspace(1) %out
1250  %val.fabs = call double @llvm.fabs.f64(double %val)
1251  %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs)
1252  store double %canonicalized, ptr addrspace(1) %out
1253  ret void
1254}
1255
; Canonicalize of fneg(fabs(x)) for a loaded double. Every run expects both the
; fabs and the fneg to be folded into the max as combined source modifiers on
; both operands (-|v[..]|, -|v[..]|), with no separate fabs/fneg instruction.
1256define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) #1 {
1257; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
1258; GFX678:       ; %bb.0:
1259; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1260; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX678-NEXT:    v_mov_b32_e32 v0, s0
1262; GFX678-NEXT:    v_mov_b32_e32 v1, s1
1263; GFX678-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
1264; GFX678-NEXT:    s_waitcnt vmcnt(0)
1265; GFX678-NEXT:    v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]|
1266; GFX678-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1267; GFX678-NEXT:    s_endpgm
1268;
1269; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
1270; GFX9:       ; %bb.0:
1271; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1272; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1273; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1274; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
1275; GFX9-NEXT:    s_waitcnt vmcnt(0)
1276; GFX9-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]|
1277; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1278; GFX9-NEXT:    s_endpgm
1279;
1280; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
1281; GFX11:       ; %bb.0:
1282; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1283; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1284; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1286; GFX11-NEXT:    s_waitcnt vmcnt(0)
1287; GFX11-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]|
1288; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1289; GFX11-NEXT:    s_endpgm
1290;
1291; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
1292; GFX12:       ; %bb.0:
1293; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1294; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1295; GFX12-NEXT:    s_wait_kmcnt 0x0
1296; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1297; GFX12-NEXT:    s_wait_loadcnt 0x0
1298; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]|
1299; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1300; GFX12-NEXT:    s_endpgm
1301  %val = load double, ptr addrspace(1) %out
1302  %val.fabs = call double @llvm.fabs.f64(double %val)
1303  %val.fabs.fneg = fneg double %val.fabs
1304  %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs.fneg)
1305  store double %canonicalized, ptr addrspace(1) %out
1306  ret void
1307}
1308
; Canonicalize of fneg(x) for a loaded double. Every run expects the fneg to be
; folded into the max as a negate source modifier on both operands
; (-v[..], -v[..]) instead of being emitted as a separate instruction.
1309define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 {
1310; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
1311; GFX678:       ; %bb.0:
1312; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1313; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1314; GFX678-NEXT:    v_mov_b32_e32 v0, s0
1315; GFX678-NEXT:    v_mov_b32_e32 v1, s1
1316; GFX678-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
1317; GFX678-NEXT:    s_waitcnt vmcnt(0)
1318; GFX678-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3]
1319; GFX678-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1320; GFX678-NEXT:    s_endpgm
1321;
1322; GFX9-LABEL: v_test_canonicalize_fneg_var_f64:
1323; GFX9:       ; %bb.0:
1324; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1325; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
1328; GFX9-NEXT:    s_waitcnt vmcnt(0)
1329; GFX9-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
1330; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1331; GFX9-NEXT:    s_endpgm
1332;
1333; GFX11-LABEL: v_test_canonicalize_fneg_var_f64:
1334; GFX11:       ; %bb.0:
1335; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1336; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1337; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1339; GFX11-NEXT:    s_waitcnt vmcnt(0)
1340; GFX11-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
1341; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1342; GFX11-NEXT:    s_endpgm
1343;
1344; GFX12-LABEL: v_test_canonicalize_fneg_var_f64:
1345; GFX12:       ; %bb.0:
1346; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1347; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1348; GFX12-NEXT:    s_wait_kmcnt 0x0
1349; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
1350; GFX12-NEXT:    s_wait_loadcnt 0x0
1351; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1]
1352; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1353; GFX12-NEXT:    s_endpgm
1354  %val = load double, ptr addrspace(1) %out
1355  %val.fneg = fneg double %val
1356  %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg)
1357  store double %canonicalized, ptr addrspace(1) %out
1358  ret void
1359}
1360
; Constant-fold test: canonicalize of the f64 constant 0.0. Every run expects
; a 64-bit store of an all-zero value (low dword set to 0, high dword copied
; from the low one) with no max instruction emitted.
1361define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 {
1362; GFX678-LABEL: test_fold_canonicalize_p0_f64:
1363; GFX678:       ; %bb.0:
1364; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1365; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1366; GFX678-NEXT:    v_mov_b32_e32 v1, v0
1367; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1369; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1370; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1371; GFX678-NEXT:    s_endpgm
1372;
1373; GFX9-LABEL: test_fold_canonicalize_p0_f64:
1374; GFX9:       ; %bb.0:
1375; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1376; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1377; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1379; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1380; GFX9-NEXT:    s_endpgm
1381;
1382; GFX11-LABEL: test_fold_canonicalize_p0_f64:
1383; GFX11:       ; %bb.0:
1384; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1385; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1387; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1388; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1390; GFX11-NEXT:    s_endpgm
1391;
1392; GFX12-LABEL: test_fold_canonicalize_p0_f64:
1393; GFX12:       ; %bb.0:
1394; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1395; GFX12-NEXT:    v_mov_b32_e32 v0, 0
1396; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1397; GFX12-NEXT:    v_mov_b32_e32 v1, v0
1398; GFX12-NEXT:    s_wait_kmcnt 0x0
1399; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1400; GFX12-NEXT:    s_endpgm
1401  %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
1402  store double %canonicalized, ptr addrspace(1) %out
1403  ret void
1404}
1405
; Constant-fold test: canonicalize of the f64 constant -0.0. Every run expects
; a 64-bit store whose low dword is 0 and whose high dword is produced by
; v_bfrev_b32 of 1 (0x80000000, i.e. only the sign bit set), so the sign of the
; zero is preserved and no max instruction is emitted.
1406define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 {
1407; GFX678-LABEL: test_fold_canonicalize_n0_f64:
1408; GFX678:       ; %bb.0:
1409; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1410; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1411; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
1412; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1413; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1414; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1415; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1416; GFX678-NEXT:    s_endpgm
1417;
1418; GFX9-LABEL: test_fold_canonicalize_n0_f64:
1419; GFX9:       ; %bb.0:
1420; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1421; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1422; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1424; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1425; GFX9-NEXT:    s_endpgm
1426;
1427; GFX11-LABEL: test_fold_canonicalize_n0_f64:
1428; GFX11:       ; %bb.0:
1429; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1430; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1431; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
1432; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1434; GFX11-NEXT:    s_endpgm
1435;
1436; GFX12-LABEL: test_fold_canonicalize_n0_f64:
1437; GFX12:       ; %bb.0:
1438; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1439; GFX12-NEXT:    v_mov_b32_e32 v0, 0
1440; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
1441; GFX12-NEXT:    s_wait_kmcnt 0x0
1442; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1443; GFX12-NEXT:    s_endpgm
1444  %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
1445  store double %canonicalized, ptr addrspace(1) %out
1446  ret void
1447}
1448
; Constant-fold test: canonicalize of the f64 constant 1.0. Every run expects
; a 64-bit store whose low dword is 0 and whose high dword is the literal
; 0x3ff00000 (the upper half of the f64 encoding of 1.0), with no max
; instruction emitted.
1449define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 {
1450; GFX678-LABEL: test_fold_canonicalize_p1_f64:
1451; GFX678:       ; %bb.0:
1452; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1453; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1454; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
1455; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1456; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1457; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1458; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1459; GFX678-NEXT:    s_endpgm
1460;
1461; GFX9-LABEL: test_fold_canonicalize_p1_f64:
1462; GFX9:       ; %bb.0:
1463; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1464; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1465; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
1466; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1467; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1468; GFX9-NEXT:    s_endpgm
1469;
1470; GFX11-LABEL: test_fold_canonicalize_p1_f64:
1471; GFX11:       ; %bb.0:
1472; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1473; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
1474; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1475; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1476; GFX11-NEXT:    s_endpgm
1477;
1478; GFX12-LABEL: test_fold_canonicalize_p1_f64:
1479; GFX12:       ; %bb.0:
1480; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1481; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000
1482; GFX12-NEXT:    s_wait_kmcnt 0x0
1483; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1484; GFX12-NEXT:    s_endpgm
1485  %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
1486  store double %canonicalized, ptr addrspace(1) %out
1487  ret void
1488}
1489
; Constant-fold test: canonicalize of the f64 constant -1.0. Every run expects
; a 64-bit store whose low dword is 0 and whose high dword is the literal
; 0xbff00000 (the upper half of the f64 encoding of -1.0), with no max
; instruction emitted.
1490define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 {
1491; GFX678-LABEL: test_fold_canonicalize_n1_f64:
1492; GFX678:       ; %bb.0:
1493; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1494; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1495; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
1496; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1497; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1498; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1499; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1500; GFX678-NEXT:    s_endpgm
1501;
1502; GFX9-LABEL: test_fold_canonicalize_n1_f64:
1503; GFX9:       ; %bb.0:
1504; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1505; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1506; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbff00000
1507; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1509; GFX9-NEXT:    s_endpgm
1510;
1511; GFX11-LABEL: test_fold_canonicalize_n1_f64:
1512; GFX11:       ; %bb.0:
1513; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1514; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
1515; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1516; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1517; GFX11-NEXT:    s_endpgm
1518;
1519; GFX12-LABEL: test_fold_canonicalize_n1_f64:
1520; GFX12:       ; %bb.0:
1521; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1522; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000
1523; GFX12-NEXT:    s_wait_kmcnt 0x0
1524; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1525; GFX12-NEXT:    s_endpgm
1526  %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
1527  store double %canonicalized, ptr addrspace(1) %out
1528  ret void
1529}
1530
; Constant-fold test: llvm.canonicalize.f64 applied to the literal 16.0.
; The expected code stores the constant's bit pattern directly
; (low dword 0, high dword 0x40300000) without any canonicalizing instruction.
1531define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 {
1532; GFX678-LABEL: test_fold_canonicalize_literal_f64:
1533; GFX678:       ; %bb.0:
1534; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1535; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1536; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
1537; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1539; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1540; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1541; GFX678-NEXT:    s_endpgm
1542;
1543; GFX9-LABEL: test_fold_canonicalize_literal_f64:
1544; GFX9:       ; %bb.0:
1545; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1546; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1547; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40300000
1548; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1549; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1550; GFX9-NEXT:    s_endpgm
1551;
1552; GFX11-LABEL: test_fold_canonicalize_literal_f64:
1553; GFX11:       ; %bb.0:
1554; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1555; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
1556; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1557; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1558; GFX11-NEXT:    s_endpgm
1559;
1560; GFX12-LABEL: test_fold_canonicalize_literal_f64:
1561; GFX12:       ; %bb.0:
1562; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1563; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000
1564; GFX12-NEXT:    s_wait_kmcnt 0x0
1565; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1566; GFX12-NEXT:    s_endpgm
1567  %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
1568  store double %canonicalized, ptr addrspace(1) %out
1569  ret void
1570}
1571
; Constant-fold test with a positive subnormal input:
; i64 4503599627370495 is 0x000FFFFFFFFFFFFF (zero exponent, all-ones mantissa).
; The expected code stores +0.0 (both dwords zero), i.e. the constant is
; flushed at compile time.
; NOTE(review): attribute group #2 is defined outside this chunk; going by the
; test name it presumably disables f64 denormals - confirm.
1572define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 {
1573; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
1574; GFX678:       ; %bb.0:
1575; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1576; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1577; GFX678-NEXT:    v_mov_b32_e32 v1, v0
1578; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1579; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1580; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1581; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1582; GFX678-NEXT:    s_endpgm
1583;
1584; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
1585; GFX9:       ; %bb.0:
1586; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1587; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1588; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1589; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1590; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1591; GFX9-NEXT:    s_endpgm
1592;
1593; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
1594; GFX11:       ; %bb.0:
1595; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1596; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1597; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1598; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1599; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1600; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1601; GFX11-NEXT:    s_endpgm
1602;
1603; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
1604; GFX12:       ; %bb.0:
1605; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1606; GFX12-NEXT:    v_mov_b32_e32 v0, 0
1607; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1608; GFX12-NEXT:    v_mov_b32_e32 v1, v0
1609; GFX12-NEXT:    s_wait_kmcnt 0x0
1610; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1611; GFX12-NEXT:    s_endpgm
1612  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
1613  store double %canonicalized, ptr addrspace(1) %out
1614  ret void
1615}
1616
; Constant-fold test with the same positive subnormal input as the
; no_denormals variant (0x000FFFFFFFFFFFFF), but here the expected code stores
; the value unchanged: low dword -1 (all ones), high dword 0xfffff.
; NOTE(review): attribute group #3 is defined outside this chunk; going by the
; test name it presumably enables f64 denormals - confirm.
1617define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 {
1618; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
1619; GFX678:       ; %bb.0:
1620; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1621; GFX678-NEXT:    v_mov_b32_e32 v0, -1
1622; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
1623; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1625; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1626; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1627; GFX678-NEXT:    s_endpgm
1628;
1629; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
1630; GFX9:       ; %bb.0:
1631; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1632; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1633; GFX9-NEXT:    v_mov_b32_e32 v0, -1
1634; GFX9-NEXT:    v_mov_b32_e32 v1, 0xfffff
1635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1637; GFX9-NEXT:    s_endpgm
1638;
1639; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
1640; GFX11:       ; %bb.0:
1641; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1642; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff
1643; GFX11-NEXT:    v_mov_b32_e32 v0, -1
1644; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1645; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1646; GFX11-NEXT:    s_endpgm
1647;
1648; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
1649; GFX12:       ; %bb.0:
1650; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1651; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff
1652; GFX12-NEXT:    v_mov_b32_e32 v0, -1
1653; GFX12-NEXT:    s_wait_kmcnt 0x0
1654; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1655; GFX12-NEXT:    s_endpgm
1656  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
1657  store double %canonicalized, ptr addrspace(1) %out
1658  ret void
1659}
1660
; Constant-fold test with a negative subnormal input:
; i64 9227875636482146303 is 0x800FFFFFFFFFFFFF (sign bit set, zero exponent,
; all-ones mantissa). The expected code stores -0.0: low dword 0 and high
; dword produced by v_bfrev_b32 of 1 (bit-reversed 1, i.e. only the sign bit).
; NOTE(review): attribute group #2 is defined outside this chunk; going by the
; test name it presumably disables f64 denormals - confirm.
1661define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 {
1662; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
1663; GFX678:       ; %bb.0:
1664; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1665; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1666; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
1667; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1668; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1669; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1670; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1671; GFX678-NEXT:    s_endpgm
1672;
1673; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
1674; GFX9:       ; %bb.0:
1675; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1676; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1677; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1678; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1680; GFX9-NEXT:    s_endpgm
1681;
1682; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
1683; GFX11:       ; %bb.0:
1684; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1685; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1686; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
1687; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1688; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1689; GFX11-NEXT:    s_endpgm
1690;
1691; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
1692; GFX12:       ; %bb.0:
1693; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1694; GFX12-NEXT:    v_mov_b32_e32 v0, 0
1695; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
1696; GFX12-NEXT:    s_wait_kmcnt 0x0
1697; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1698; GFX12-NEXT:    s_endpgm
1699  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
1700  store double %canonicalized, ptr addrspace(1) %out
1701  ret void
1702}
1703
; Constant-fold test with the same negative subnormal input as the
; no_denormals variant (0x800FFFFFFFFFFFFF), but here the expected code stores
; the value unchanged: low dword -1 (all ones), high dword 0x800fffff.
; NOTE(review): attribute group #3 is defined outside this chunk; going by the
; test name it presumably enables f64 denormals - confirm.
1704define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 {
1705; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
1706; GFX678:       ; %bb.0:
1707; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1708; GFX678-NEXT:    v_mov_b32_e32 v0, -1
1709; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
1710; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1712; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1713; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1714; GFX678-NEXT:    s_endpgm
1715;
1716; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
1717; GFX9:       ; %bb.0:
1718; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1719; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1720; GFX9-NEXT:    v_mov_b32_e32 v0, -1
1721; GFX9-NEXT:    v_mov_b32_e32 v1, 0x800fffff
1722; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1723; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1724; GFX9-NEXT:    s_endpgm
1725;
1726; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
1727; GFX11:       ; %bb.0:
1728; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1729; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff
1730; GFX11-NEXT:    v_mov_b32_e32 v0, -1
1731; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1732; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1733; GFX11-NEXT:    s_endpgm
1734;
1735; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
1736; GFX12:       ; %bb.0:
1737; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1738; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff
1739; GFX12-NEXT:    v_mov_b32_e32 v0, -1
1740; GFX12-NEXT:    s_wait_kmcnt 0x0
1741; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1742; GFX12-NEXT:    s_endpgm
1743  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
1744  store double %canonicalized, ptr addrspace(1) %out
1745  ret void
1746}
1747
; Constant-fold test: the input 0x7FF8000000000000 is a positive NaN whose
; only mantissa bit is the top one. The expected code stores the same bit
; pattern back (low dword 0, high dword 0x7ff80000) using plain moves.
1748define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 {
1749; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
1750; GFX678:       ; %bb.0:
1751; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1752; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1753; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1754; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1756; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1757; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1758; GFX678-NEXT:    s_endpgm
1759;
1760; GFX9-LABEL: test_fold_canonicalize_qnan_f64:
1761; GFX9:       ; %bb.0:
1762; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1763; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1764; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1765; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1767; GFX9-NEXT:    s_endpgm
1768;
1769; GFX11-LABEL: test_fold_canonicalize_qnan_f64:
1770; GFX11:       ; %bb.0:
1771; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1772; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1773; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1774; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1775; GFX11-NEXT:    s_endpgm
1776;
1777; GFX12-LABEL: test_fold_canonicalize_qnan_f64:
1778; GFX12:       ; %bb.0:
1779; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1780; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1781; GFX12-NEXT:    s_wait_kmcnt 0x0
1782; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1783; GFX12-NEXT:    s_endpgm
1784  %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
1785  store double %canonicalized, ptr addrspace(1) %out
1786  ret void
1787}
1788
; Constant-fold test: the input is i64 -1 reinterpreted as a double, i.e. all
; 64 bits set (a NaN with the sign bit and every mantissa bit set).
; The expected code stores 0x7ff80000:00000000, so the fold drops the sign bit
; and the payload rather than keeping the input bits.
1789define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 {
1790; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
1791; GFX678:       ; %bb.0:
1792; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1793; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1794; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1795; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1796; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1797; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1798; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1799; GFX678-NEXT:    s_endpgm
1800;
1801; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
1802; GFX9:       ; %bb.0:
1803; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1804; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1805; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1806; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1807; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1808; GFX9-NEXT:    s_endpgm
1809;
1810; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
1811; GFX11:       ; %bb.0:
1812; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1813; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1814; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1815; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1816; GFX11-NEXT:    s_endpgm
1817;
1818; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
1819; GFX12:       ; %bb.0:
1820; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1821; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1822; GFX12-NEXT:    s_wait_kmcnt 0x0
1823; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1824; GFX12-NEXT:    s_endpgm
1825  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
1826  store double %canonicalized, ptr addrspace(1) %out
1827  ret void
1828}
1829
; Constant-fold test: the input is i64 -2 reinterpreted as a double
; (0xFFFFFFFFFFFFFFFE, a NaN with the sign bit set and a near-full payload).
; As in the neg1 variant, the expected code stores 0x7ff80000:00000000, so the
; sign bit and payload are not preserved.
1830define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 {
1831; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
1832; GFX678:       ; %bb.0:
1833; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1834; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1835; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1836; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1837; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1838; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1839; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1840; GFX678-NEXT:    s_endpgm
1841;
1842; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
1843; GFX9:       ; %bb.0:
1844; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1845; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1846; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1847; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1848; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1849; GFX9-NEXT:    s_endpgm
1850;
1851; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
1852; GFX11:       ; %bb.0:
1853; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1854; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1855; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1856; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1857; GFX11-NEXT:    s_endpgm
1858;
1859; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
1860; GFX12:       ; %bb.0:
1861; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1862; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1863; GFX12-NEXT:    s_wait_kmcnt 0x0
1864; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1865; GFX12-NEXT:    s_endpgm
1866  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
1867  store double %canonicalized, ptr addrspace(1) %out
1868  ret void
1869}
1870
; Constant-fold test: i64 9218868437227405313 is 0x7FF0000000000001, a positive
; NaN whose top mantissa bit is clear and whose payload is 1.
; The expected code stores 0x7ff80000:00000000 instead of the input bits, i.e.
; the fold replaces the NaN with the quiet pattern and drops the payload.
1871define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 {
1872; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
1873; GFX678:       ; %bb.0:
1874; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1875; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1876; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1877; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1878; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1879; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1880; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1881; GFX678-NEXT:    s_endpgm
1882;
1883; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64:
1884; GFX9:       ; %bb.0:
1885; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1886; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1887; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1888; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1889; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1890; GFX9-NEXT:    s_endpgm
1891;
1892; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64:
1893; GFX11:       ; %bb.0:
1894; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1895; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1896; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1898; GFX11-NEXT:    s_endpgm
1899;
1900; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64:
1901; GFX12:       ; %bb.0:
1902; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1903; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1904; GFX12-NEXT:    s_wait_kmcnt 0x0
1905; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1906; GFX12-NEXT:    s_endpgm
1907  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
1908  store double %canonicalized, ptr addrspace(1) %out
1909  ret void
1910}
1911
; Constant-fold test: i64 9223372036854775807 is 0x7FFFFFFFFFFFFFFF, a positive
; NaN with every mantissa bit set. The expected code stores
; 0x7ff80000:00000000, so the payload is dropped.
; NOTE(review): this pattern has the top mantissa bit (bit 51) set, which under
; the usual IEEE-754 encoding makes it a quiet NaN despite the "snan" in the
; test name - confirm whether a signaling pattern was intended.
1912define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 {
1913; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
1914; GFX678:       ; %bb.0:
1915; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1916; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1917; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1918; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1919; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1920; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1921; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1922; GFX678-NEXT:    s_endpgm
1923;
1924; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64:
1925; GFX9:       ; %bb.0:
1926; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1927; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1928; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1929; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1930; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1931; GFX9-NEXT:    s_endpgm
1932;
1933; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64:
1934; GFX11:       ; %bb.0:
1935; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1936; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1937; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1939; GFX11-NEXT:    s_endpgm
1940;
1941; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64:
1942; GFX12:       ; %bb.0:
1943; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1944; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1945; GFX12-NEXT:    s_wait_kmcnt 0x0
1946; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1947; GFX12-NEXT:    s_endpgm
1948  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
1949  store double %canonicalized, ptr addrspace(1) %out
1950  ret void
1951}
1952
; Constant-fold test: i64 18442240474082181121 is 0xFFF0000000000001, a NaN
; with the sign bit set, the top mantissa bit clear and a payload of 1.
; The expected code stores 0x7ff80000:00000000, so the fold drops both the
; sign bit and the payload.
1953define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 {
1954; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
1955; GFX678:       ; %bb.0:
1956; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1957; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1958; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1959; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
1960; GFX678-NEXT:    v_mov_b32_e32 v3, s1
1961; GFX678-NEXT:    v_mov_b32_e32 v2, s0
1962; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1963; GFX678-NEXT:    s_endpgm
1964;
1965; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64:
1966; GFX9:       ; %bb.0:
1967; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1968; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1969; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
1970; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
1972; GFX9-NEXT:    s_endpgm
1973;
1974; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64:
1975; GFX11:       ; %bb.0:
1976; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1977; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1978; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1980; GFX11-NEXT:    s_endpgm
1981;
1982; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64:
1983; GFX12:       ; %bb.0:
1984; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
1985; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
1986; GFX12-NEXT:    s_wait_kmcnt 0x0
1987; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
1988; GFX12-NEXT:    s_endpgm
1989  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
1990  store double %canonicalized, ptr addrspace(1) %out
1991  ret void
1992}
1993
; Constant-fold test: i64 18446744073709551615 is 0xFFFFFFFFFFFFFFFF (all 64
; bits set). The expected code stores 0x7ff80000:00000000, so the sign bit and
; payload are dropped.
; NOTE(review): this is the same bit pattern as i64 -1, and it has the top
; mantissa bit (bit 51) set, which under the usual IEEE-754 encoding makes it
; a quiet NaN despite the "snan" in the test name - confirm whether a distinct
; signaling pattern was intended.
1994define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 {
1995; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
1996; GFX678:       ; %bb.0:
1997; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1998; GFX678-NEXT:    v_mov_b32_e32 v0, 0
1999; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
2000; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX678-NEXT:    v_mov_b32_e32 v3, s1
2002; GFX678-NEXT:    v_mov_b32_e32 v2, s0
2003; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2004; GFX678-NEXT:    s_endpgm
2005;
2006; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64:
2007; GFX9:       ; %bb.0:
2008; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2009; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2010; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
2011; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2012; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
2013; GFX9-NEXT:    s_endpgm
2014;
2015; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64:
2016; GFX11:       ; %bb.0:
2017; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2018; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
2019; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
2021; GFX11-NEXT:    s_endpgm
2022;
2023; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64:
2024; GFX12:       ; %bb.0:
2025; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2026; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000
2027; GFX12-NEXT:    s_wait_kmcnt 0x0
2028; GFX12-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
2029; GFX12-NEXT:    s_endpgm
2030  %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
2031  store double %canonicalized, ptr addrspace(1) %out
2032  ret void
2033}
2034
; Non-constant test: each work-item loads a double from %arg indexed by its
; workitem id, canonicalizes it, and stores the result to the same index of
; %out. Because the value is only known at run time, the expected code keeps a
; real canonicalizing instruction:
;   - GFX6 and GFX8 checks expect v_mul_f64 by 1.0,
;   - GFX9 and GFX11 checks expect v_max_f64 of the value with itself,
;   - GFX12 checks expect v_max_num_f64 of the value with itself.
; NOTE(review): attribute group #4 is defined outside this chunk; the "flush"
; suffix suggests a denormal-flushing mode - confirm.
2035define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
2036; GFX6-LABEL: test_canonicalize_value_f64_flush:
2037; GFX6:       ; %bb.0:
2038; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2039; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2040; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2041; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2042; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2043; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2044; GFX6-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2045; GFX6-NEXT:    v_mov_b32_e32 v3, s3
2046; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
2047; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2048; GFX6-NEXT:    s_waitcnt vmcnt(0)
2049; GFX6-NEXT:    v_mul_f64 v[0:1], 1.0, v[0:1]
2050; GFX6-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2051; GFX6-NEXT:    s_endpgm
2052;
2053; GFX8-LABEL: test_canonicalize_value_f64_flush:
2054; GFX8:       ; %bb.0:
2055; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2056; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2057; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2059; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2060; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2061; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2062; GFX8-NEXT:    v_mov_b32_e32 v3, s3
2063; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
2064; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2065; GFX8-NEXT:    s_waitcnt vmcnt(0)
2066; GFX8-NEXT:    v_mul_f64 v[0:1], 1.0, v[0:1]
2067; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2068; GFX8-NEXT:    s_endpgm
2069;
2070; GFX9-LABEL: test_canonicalize_value_f64_flush:
2071; GFX9:       ; %bb.0:
2072; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2073; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2074; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2075; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
2076; GFX9-NEXT:    s_waitcnt vmcnt(0)
2077; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2078; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2079; GFX9-NEXT:    s_endpgm
2080;
2081; GFX11-LABEL: test_canonicalize_value_f64_flush:
2082; GFX11:       ; %bb.0:
2083; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2084; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2085; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2086; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2087; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2088; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
2089; GFX11-NEXT:    s_waitcnt vmcnt(0)
2090; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2091; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
2092; GFX11-NEXT:    s_endpgm
2093;
2094; GFX12-LABEL: test_canonicalize_value_f64_flush:
2095; GFX12:       ; %bb.0:
2096; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2097; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2098; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2099; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2100; GFX12-NEXT:    s_wait_kmcnt 0x0
2101; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
2102; GFX12-NEXT:    s_wait_loadcnt 0x0
2103; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
2104; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
2105; GFX12-NEXT:    s_endpgm
2106  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2107  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
2108  %v = load double, ptr addrspace(1) %gep, align 8
2109  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
2110  %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id
2111  store double %canonicalized, ptr addrspace(1) %gep2, align 8
2112  ret void
2113}
2114
; Non-constant test, f32 variant: each work-item loads a float from %arg
; indexed by its workitem id, canonicalizes it, and stores it to the same
; index of %out. Expected canonicalizing instruction per target:
;   - GFX6 and GFX8 checks expect v_mul_f32 by 1.0,
;   - GFX9 and GFX11 checks expect v_max_f32 of the value with itself,
;   - GFX12 checks expect v_max_num_f32 of the value with itself.
; NOTE(review): attribute group #4 is defined outside this chunk; the "flush"
; suffix suggests a denormal-flushing mode - confirm.
2115define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
2116; GFX6-LABEL: test_canonicalize_value_f32_flush:
2117; GFX6:       ; %bb.0:
2118; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2119; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2120; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2121; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2122; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2123; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2124; GFX6-NEXT:    flat_load_dword v0, v[0:1]
2125; GFX6-NEXT:    v_mov_b32_e32 v1, s3
2126; GFX6-NEXT:    s_waitcnt vmcnt(0)
2127; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v0
2128; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2129; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2130; GFX6-NEXT:    flat_store_dword v[0:1], v3
2131; GFX6-NEXT:    s_endpgm
2132;
2133; GFX8-LABEL: test_canonicalize_value_f32_flush:
2134; GFX8:       ; %bb.0:
2135; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2136; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2137; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2138; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2139; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2140; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2141; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2142; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2143; GFX8-NEXT:    s_waitcnt vmcnt(0)
2144; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v0
2145; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2146; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2147; GFX8-NEXT:    flat_store_dword v[0:1], v3
2148; GFX8-NEXT:    s_endpgm
2149;
2150; GFX9-LABEL: test_canonicalize_value_f32_flush:
2151; GFX9:       ; %bb.0:
2152; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2153; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2154; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2155; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
2156; GFX9-NEXT:    s_waitcnt vmcnt(0)
2157; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2158; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2159; GFX9-NEXT:    s_endpgm
2160;
2161; GFX11-LABEL: test_canonicalize_value_f32_flush:
2162; GFX11:       ; %bb.0:
2163; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2164; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2165; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2166; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2167; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2168; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
2169; GFX11-NEXT:    s_waitcnt vmcnt(0)
2170; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
2171; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
2172; GFX11-NEXT:    s_endpgm
2173;
2174; GFX12-LABEL: test_canonicalize_value_f32_flush:
2175; GFX12:       ; %bb.0:
2176; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2177; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2178; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2179; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2180; GFX12-NEXT:    s_wait_kmcnt 0x0
2181; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
2182; GFX12-NEXT:    s_wait_loadcnt 0x0
2183; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
2184; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
2185; GFX12-NEXT:    s_endpgm
2186  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2187  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
2188  %v = load float, ptr addrspace(1) %gep, align 4
2189  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
2190  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
2191  store float %canonicalized, ptr addrspace(1) %gep2, align 4
2192  ret void
2193}
2194
; Non-constant test, f16 variant: each work-item loads a half from %arg
; indexed by its workitem id, canonicalizes it, and stores it to the same
; index of %out. Expected canonicalizing sequence per target:
;   - GFX6 checks expect a round trip through f32
;     (v_cvt_f32_f16 followed by v_cvt_f16_f32) and no multiply or max,
;   - GFX8 checks expect v_mul_f16 by 1.0,
;   - GFX9 and GFX11 checks expect v_max_f16 of the value with itself,
;   - GFX12 checks expect v_max_num_f16 of the value with itself.
; NOTE(review): attribute group #4 is defined outside this chunk; the "flush"
; suffix suggests a denormal-flushing mode - confirm.
2195define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
2196; GFX6-LABEL: test_canonicalize_value_f16_flush:
2197; GFX6:       ; %bb.0:
2198; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2199; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2200; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2201; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2202; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2203; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2204; GFX6-NEXT:    flat_load_ushort v0, v[0:1]
2205; GFX6-NEXT:    v_mov_b32_e32 v1, s3
2206; GFX6-NEXT:    s_waitcnt vmcnt(0)
2207; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
2208; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v0
2209; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2210; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2211; GFX6-NEXT:    flat_store_short v[0:1], v3
2212; GFX6-NEXT:    s_endpgm
2213;
2214; GFX8-LABEL: test_canonicalize_value_f16_flush:
2215; GFX8:       ; %bb.0:
2216; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2217; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2218; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2219; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2220; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2221; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2222; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2223; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2224; GFX8-NEXT:    s_waitcnt vmcnt(0)
2225; GFX8-NEXT:    v_mul_f16_e32 v3, 1.0, v0
2226; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2227; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2228; GFX8-NEXT:    flat_store_short v[0:1], v3
2229; GFX8-NEXT:    s_endpgm
2230;
2231; GFX9-LABEL: test_canonicalize_value_f16_flush:
2232; GFX9:       ; %bb.0:
2233; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2234; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2235; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2236; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
2237; GFX9-NEXT:    s_waitcnt vmcnt(0)
2238; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
2239; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
2240; GFX9-NEXT:    s_endpgm
2241;
2242; GFX11-LABEL: test_canonicalize_value_f16_flush:
2243; GFX11:       ; %bb.0:
2244; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2245; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2246; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2247; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2248; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2249; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
2250; GFX11-NEXT:    s_waitcnt vmcnt(0)
2251; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
2252; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
2253; GFX11-NEXT:    s_endpgm
2254;
2255; GFX12-LABEL: test_canonicalize_value_f16_flush:
2256; GFX12:       ; %bb.0:
2257; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2258; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2259; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2260; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2261; GFX12-NEXT:    s_wait_kmcnt 0x0
2262; GFX12-NEXT:    global_load_u16 v1, v0, s[0:1]
2263; GFX12-NEXT:    s_wait_loadcnt 0x0
2264; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
2265; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
2266; GFX12-NEXT:    s_endpgm
2267  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2268  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
2269  %v = load half, ptr addrspace(1) %gep, align 2
2270  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
2271  %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
2272  store half %canonicalized, ptr addrspace(1) %gep2, align 2
2273  ret void
2274}
2275
2276
; Kernel test: each workitem loads a <2 x half> from %arg indexed by
; workitem.id.x, applies llvm.canonicalize.v2f16, and stores the result to
; %out at the same index.
; Uses attribute group #4, which is defined outside this excerpt (presumably
; the f16 denormal-flushing mode, given the _flush suffix -- confirm there).
; Lowering pinned by the checks below:
;   - GFX6 splits the halves, round-trips each through f32
;     (v_cvt_f32_f16 then v_cvt_f16_f32) and repacks with shift + or.
;   - GFX8 multiplies each half by 1.0 (0x3c00 is 1.0 in half precision; the
;     high half goes through the SDWA form) and ors the halves together.
;   - GFX9 and GFX11 use v_pk_max_f16 of the value with itself.
;   - GFX12 uses v_pk_max_num_f16 of the value with itself.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
; NOTE(review): the load is align 4 but the store is align 2 -- confirm this
; is intentional.
2277define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 {
2278; GFX6-LABEL: test_canonicalize_value_v2f16_flush:
2279; GFX6:       ; %bb.0:
2280; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2281; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2282; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2283; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2284; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2285; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2286; GFX6-NEXT:    flat_load_dword v0, v[0:1]
2287; GFX6-NEXT:    v_mov_b32_e32 v3, s3
2288; GFX6-NEXT:    s_waitcnt vmcnt(0)
2289; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2290; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
2291; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
2292; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
2293; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
2294; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2295; GFX6-NEXT:    v_or_b32_e32 v4, v0, v1
2296; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2297; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
2298; GFX6-NEXT:    flat_store_dword v[0:1], v4
2299; GFX6-NEXT:    s_endpgm
2300;
2301; GFX8-LABEL: test_canonicalize_value_v2f16_flush:
2302; GFX8:       ; %bb.0:
2303; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2304; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2306; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2307; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2308; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2309; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2310; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3c00
2311; GFX8-NEXT:    v_mov_b32_e32 v3, s3
2312; GFX8-NEXT:    s_waitcnt vmcnt(0)
2313; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2314; GFX8-NEXT:    v_mul_f16_e32 v0, 1.0, v0
2315; GFX8-NEXT:    v_or_b32_e32 v4, v0, v1
2316; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2317; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
2318; GFX8-NEXT:    flat_store_dword v[0:1], v4
2319; GFX8-NEXT:    s_endpgm
2320;
2321; GFX9-LABEL: test_canonicalize_value_v2f16_flush:
2322; GFX9:       ; %bb.0:
2323; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2324; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2325; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2326; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
2327; GFX9-NEXT:    s_waitcnt vmcnt(0)
2328; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2329; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2330; GFX9-NEXT:    s_endpgm
2331;
2332; GFX11-LABEL: test_canonicalize_value_v2f16_flush:
2333; GFX11:       ; %bb.0:
2334; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2335; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2336; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2337; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2338; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2339; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
2340; GFX11-NEXT:    s_waitcnt vmcnt(0)
2341; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2342; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
2343; GFX11-NEXT:    s_endpgm
2344;
2345; GFX12-LABEL: test_canonicalize_value_v2f16_flush:
2346; GFX12:       ; %bb.0:
2347; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2348; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2349; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2350; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2351; GFX12-NEXT:    s_wait_kmcnt 0x0
2352; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
2353; GFX12-NEXT:    s_wait_loadcnt 0x0
2354; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
2355; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
2356; GFX12-NEXT:    s_endpgm
2357  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2358  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
2359  %v = load <2 x half>, ptr addrspace(1) %gep, align 4
2360  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
2361  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i32 %id
2362  store <2 x half> %canonicalized, ptr addrspace(1) %gep2, align 2
2363  ret void
2364}
2365
; Kernel test: each workitem loads a double from %arg indexed by
; workitem.id.x, applies llvm.canonicalize.f64, and stores the result to %out
; at the same index.
; Uses attribute group #3, which is defined outside this excerpt (presumably
; the denormal-preserving mode, given the _denorm suffix -- confirm there).
; Lowering pinned by the checks below: GFX6, GFX8, GFX9 and GFX11 all use
; v_max_f64 of the value with itself; GFX12 uses v_max_num_f64 instead.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2366define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
2367; GFX6-LABEL: test_canonicalize_value_f64_denorm:
2368; GFX6:       ; %bb.0:
2369; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2370; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2371; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2372; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2373; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2374; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2375; GFX6-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2376; GFX6-NEXT:    v_mov_b32_e32 v3, s3
2377; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
2378; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2379; GFX6-NEXT:    s_waitcnt vmcnt(0)
2380; GFX6-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2381; GFX6-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2382; GFX6-NEXT:    s_endpgm
2383;
2384; GFX8-LABEL: test_canonicalize_value_f64_denorm:
2385; GFX8:       ; %bb.0:
2386; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2387; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2388; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2389; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2390; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2391; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2392; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2393; GFX8-NEXT:    v_mov_b32_e32 v3, s3
2394; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
2395; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2396; GFX8-NEXT:    s_waitcnt vmcnt(0)
2397; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2398; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2399; GFX8-NEXT:    s_endpgm
2400;
2401; GFX9-LABEL: test_canonicalize_value_f64_denorm:
2402; GFX9:       ; %bb.0:
2403; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2404; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2405; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2406; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
2407; GFX9-NEXT:    s_waitcnt vmcnt(0)
2408; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2409; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2410; GFX9-NEXT:    s_endpgm
2411;
2412; GFX11-LABEL: test_canonicalize_value_f64_denorm:
2413; GFX11:       ; %bb.0:
2414; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2415; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2416; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2417; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2418; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2419; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
2420; GFX11-NEXT:    s_waitcnt vmcnt(0)
2421; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2422; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
2423; GFX11-NEXT:    s_endpgm
2424;
2425; GFX12-LABEL: test_canonicalize_value_f64_denorm:
2426; GFX12:       ; %bb.0:
2427; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2428; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2429; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2430; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
2431; GFX12-NEXT:    s_wait_kmcnt 0x0
2432; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
2433; GFX12-NEXT:    s_wait_loadcnt 0x0
2434; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
2435; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
2436; GFX12-NEXT:    s_endpgm
2437  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2438  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
2439  %v = load double, ptr addrspace(1) %gep, align 8
2440  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
2441  %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id
2442  store double %canonicalized, ptr addrspace(1) %gep2, align 8
2443  ret void
2444}
2445
; Kernel test: each workitem loads a float from %arg indexed by
; workitem.id.x, applies llvm.canonicalize.f32, and stores the result to %out
; at the same index.
; Uses attribute group #3, which is defined outside this excerpt (presumably
; the denormal-preserving mode, given the _denorm suffix -- confirm there).
; Lowering pinned by the checks below: GFX6 and GFX8 multiply by 1.0 with
; v_mul_f32; GFX9 and GFX11 use v_max_f32 of the value with itself; GFX12
; uses v_max_num_f32.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2446define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
2447; GFX6-LABEL: test_canonicalize_value_f32_denorm:
2448; GFX6:       ; %bb.0:
2449; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2450; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2451; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2452; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2453; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2454; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2455; GFX6-NEXT:    flat_load_dword v0, v[0:1]
2456; GFX6-NEXT:    v_mov_b32_e32 v1, s3
2457; GFX6-NEXT:    s_waitcnt vmcnt(0)
2458; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v0
2459; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2460; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2461; GFX6-NEXT:    flat_store_dword v[0:1], v3
2462; GFX6-NEXT:    s_endpgm
2463;
2464; GFX8-LABEL: test_canonicalize_value_f32_denorm:
2465; GFX8:       ; %bb.0:
2466; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2467; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2468; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2469; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2470; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2471; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2472; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2473; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2474; GFX8-NEXT:    s_waitcnt vmcnt(0)
2475; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v0
2476; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2477; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2478; GFX8-NEXT:    flat_store_dword v[0:1], v3
2479; GFX8-NEXT:    s_endpgm
2480;
2481; GFX9-LABEL: test_canonicalize_value_f32_denorm:
2482; GFX9:       ; %bb.0:
2483; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2484; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2485; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2486; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
2487; GFX9-NEXT:    s_waitcnt vmcnt(0)
2488; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2489; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2490; GFX9-NEXT:    s_endpgm
2491;
2492; GFX11-LABEL: test_canonicalize_value_f32_denorm:
2493; GFX11:       ; %bb.0:
2494; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2495; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2496; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2497; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2498; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2499; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
2500; GFX11-NEXT:    s_waitcnt vmcnt(0)
2501; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
2502; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
2503; GFX11-NEXT:    s_endpgm
2504;
2505; GFX12-LABEL: test_canonicalize_value_f32_denorm:
2506; GFX12:       ; %bb.0:
2507; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2508; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2509; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2510; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2511; GFX12-NEXT:    s_wait_kmcnt 0x0
2512; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
2513; GFX12-NEXT:    s_wait_loadcnt 0x0
2514; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
2515; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
2516; GFX12-NEXT:    s_endpgm
2517  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2518  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
2519  %v = load float, ptr addrspace(1) %gep, align 4
2520  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
2521  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
2522  store float %canonicalized, ptr addrspace(1) %gep2, align 4
2523  ret void
2524}
2525
2526; FIXME: Conversion to float should count as the canonicalize pre-gfx8
; Kernel test: each workitem loads a half from %arg indexed by workitem.id.x,
; applies llvm.canonicalize.f16, and stores the result to %out at the same
; index.
; Uses attribute group #3, which is defined outside this excerpt (presumably
; the denormal-preserving mode, given the _denorm suffix -- confirm there).
; Lowering pinned by the checks below: GFX6 round-trips the value through f32
; (v_cvt_f32_f16 then v_cvt_f16_f32); GFX8, GFX9 and GFX11 use v_max_f16 of
; the value with itself; GFX12 uses v_max_num_f16.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2527define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
2528; GFX6-LABEL: test_canonicalize_value_f16_denorm:
2529; GFX6:       ; %bb.0:
2530; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2531; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2532; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2533; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2534; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2535; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2536; GFX6-NEXT:    flat_load_ushort v0, v[0:1]
2537; GFX6-NEXT:    v_mov_b32_e32 v1, s3
2538; GFX6-NEXT:    s_waitcnt vmcnt(0)
2539; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
2540; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v0
2541; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2542; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2543; GFX6-NEXT:    flat_store_short v[0:1], v3
2544; GFX6-NEXT:    s_endpgm
2545;
2546; GFX8-LABEL: test_canonicalize_value_f16_denorm:
2547; GFX8:       ; %bb.0:
2548; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2549; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2550; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2551; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2552; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2553; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2554; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2555; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2556; GFX8-NEXT:    s_waitcnt vmcnt(0)
2557; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
2558; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2559; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2560; GFX8-NEXT:    flat_store_short v[0:1], v3
2561; GFX8-NEXT:    s_endpgm
2562;
2563; GFX9-LABEL: test_canonicalize_value_f16_denorm:
2564; GFX9:       ; %bb.0:
2565; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2566; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2567; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2568; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
2569; GFX9-NEXT:    s_waitcnt vmcnt(0)
2570; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
2571; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
2572; GFX9-NEXT:    s_endpgm
2573;
2574; GFX11-LABEL: test_canonicalize_value_f16_denorm:
2575; GFX11:       ; %bb.0:
2576; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2577; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2578; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2579; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2580; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2581; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
2582; GFX11-NEXT:    s_waitcnt vmcnt(0)
2583; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
2584; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
2585; GFX11-NEXT:    s_endpgm
2586;
2587; GFX12-LABEL: test_canonicalize_value_f16_denorm:
2588; GFX12:       ; %bb.0:
2589; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2590; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2591; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2592; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2593; GFX12-NEXT:    s_wait_kmcnt 0x0
2594; GFX12-NEXT:    global_load_u16 v1, v0, s[0:1]
2595; GFX12-NEXT:    s_wait_loadcnt 0x0
2596; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
2597; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
2598; GFX12-NEXT:    s_endpgm
2599  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2600  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
2601  %v = load half, ptr addrspace(1) %gep, align 2
2602  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
2603  %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
2604  store half %canonicalized, ptr addrspace(1) %gep2, align 2
2605  ret void
2606}
2607
2608
2609
; Kernel test: each workitem loads a <2 x half> from %arg indexed by
; workitem.id.x, applies llvm.canonicalize.v2f16, and stores the result to
; %out at the same index.
; Uses attribute group #3, which is defined outside this excerpt (presumably
; the denormal-preserving mode, given the _denorm suffix -- confirm there).
; Lowering pinned by the checks below:
;   - GFX6 splits the halves, round-trips each through f32
;     (v_cvt_f32_f16 then v_cvt_f16_f32) and repacks with shift + or.
;   - GFX8 uses v_max_f16 on each half (SDWA form for the high half) and ors
;     the two results together.
;   - GFX9 and GFX11 use v_pk_max_f16 of the value with itself.
;   - GFX12 uses v_pk_max_num_f16 of the value with itself.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
; NOTE(review): the load is align 4 but the store is align 2 -- confirm this
; is intentional.
2610define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
2611; GFX6-LABEL: test_canonicalize_value_v2f16_denorm:
2612; GFX6:       ; %bb.0:
2613; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2614; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2615; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2616; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2617; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2618; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2619; GFX6-NEXT:    flat_load_dword v0, v[0:1]
2620; GFX6-NEXT:    v_mov_b32_e32 v3, s3
2621; GFX6-NEXT:    s_waitcnt vmcnt(0)
2622; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2623; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
2624; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
2625; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
2626; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
2627; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2628; GFX6-NEXT:    v_or_b32_e32 v4, v0, v1
2629; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
2630; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
2631; GFX6-NEXT:    flat_store_dword v[0:1], v4
2632; GFX6-NEXT:    s_endpgm
2633;
2634; GFX8-LABEL: test_canonicalize_value_v2f16_denorm:
2635; GFX8:       ; %bb.0:
2636; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2637; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2638; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2639; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2640; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2641; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2642; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2643; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2644; GFX8-NEXT:    s_waitcnt vmcnt(0)
2645; GFX8-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2646; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
2647; GFX8-NEXT:    v_or_b32_e32 v3, v0, v3
2648; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2649; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2650; GFX8-NEXT:    flat_store_dword v[0:1], v3
2651; GFX8-NEXT:    s_endpgm
2652;
2653; GFX9-LABEL: test_canonicalize_value_v2f16_denorm:
2654; GFX9:       ; %bb.0:
2655; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2656; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2657; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2658; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
2659; GFX9-NEXT:    s_waitcnt vmcnt(0)
2660; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2661; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2662; GFX9-NEXT:    s_endpgm
2663;
2664; GFX11-LABEL: test_canonicalize_value_v2f16_denorm:
2665; GFX11:       ; %bb.0:
2666; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2667; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2668; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2669; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2670; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2671; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
2672; GFX11-NEXT:    s_waitcnt vmcnt(0)
2673; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2674; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
2675; GFX11-NEXT:    s_endpgm
2676;
2677; GFX12-LABEL: test_canonicalize_value_v2f16_denorm:
2678; GFX12:       ; %bb.0:
2679; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2680; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2681; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2682; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2683; GFX12-NEXT:    s_wait_kmcnt 0x0
2684; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
2685; GFX12-NEXT:    s_wait_loadcnt 0x0
2686; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v1
2687; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3]
2688; GFX12-NEXT:    s_endpgm
2689  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
2690  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
2691  %v = load <2 x half>, ptr addrspace(1) %gep, align 4
2692  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
2693  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i32 %id
2694  store <2 x half> %canonicalized, ptr addrspace(1) %gep2, align 2
2695  ret void
2696}
2697
; Kernel test: each workitem loads a <2 x double> from %out indexed by
; workitem.id.x, applies llvm.canonicalize.v2f64, and stores the result to
; the base of %out (the store is not indexed by the workitem id).
; Uses attribute group #1, which is defined outside this excerpt.
; Lowering pinned by the checks below: one 128-bit load, then one v_max_f64
; per element (value with itself) on GFX6, GFX8, GFX9 and GFX11, or
; v_max_num_f64 on GFX12, then one 128-bit store.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2698define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 {
2699; GFX6-LABEL: v_test_canonicalize_var_v2f64:
2700; GFX6:       ; %bb.0:
2701; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2702; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2703; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2704; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2705; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2706; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2707; GFX6-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2708; GFX6-NEXT:    v_mov_b32_e32 v5, s1
2709; GFX6-NEXT:    v_mov_b32_e32 v4, s0
2710; GFX6-NEXT:    s_waitcnt vmcnt(0)
2711; GFX6-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2712; GFX6-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2713; GFX6-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2714; GFX6-NEXT:    s_endpgm
2715;
2716; GFX8-LABEL: v_test_canonicalize_var_v2f64:
2717; GFX8:       ; %bb.0:
2718; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2719; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2720; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2721; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2722; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2723; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2724; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2725; GFX8-NEXT:    v_mov_b32_e32 v5, s1
2726; GFX8-NEXT:    v_mov_b32_e32 v4, s0
2727; GFX8-NEXT:    s_waitcnt vmcnt(0)
2728; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2729; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2730; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2731; GFX8-NEXT:    s_endpgm
2732;
2733; GFX9-LABEL: v_test_canonicalize_var_v2f64:
2734; GFX9:       ; %bb.0:
2735; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2736; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2737; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2738; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2739; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
2740; GFX9-NEXT:    s_waitcnt vmcnt(0)
2741; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2742; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2743; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2744; GFX9-NEXT:    s_endpgm
2745;
2746; GFX11-LABEL: v_test_canonicalize_var_v2f64:
2747; GFX11:       ; %bb.0:
2748; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2749; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2750; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2751; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2752; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2753; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2754; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[0:1]
2755; GFX11-NEXT:    s_waitcnt vmcnt(0)
2756; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2757; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2758; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
2759; GFX11-NEXT:    s_endpgm
2760;
2761; GFX12-LABEL: v_test_canonicalize_var_v2f64:
2762; GFX12:       ; %bb.0:
2763; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2764; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2765; GFX12-NEXT:    v_mov_b32_e32 v4, 0
2766; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2767; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2768; GFX12-NEXT:    s_wait_kmcnt 0x0
2769; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[0:1]
2770; GFX12-NEXT:    s_wait_loadcnt 0x0
2771; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
2772; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
2773; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
2774; GFX12-NEXT:    s_endpgm
2775  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2776  %gep = getelementptr <2 x double>, ptr addrspace(1) %out, i32 %tid
2777  %val = load <2 x double>, ptr addrspace(1) %gep
2778  %canonicalized = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %val)
2779  store <2 x double> %canonicalized, ptr addrspace(1) %out
2780  ret void
2781}
2782
2783
; Function test (non-kernel): returns llvm.canonicalize.v2f32 of the
; by-value <2 x float> argument.
; Uses attribute group #1, which is defined outside this excerpt.
; Lowering pinned by the checks below: the shared GFX678 prefix (hawaii and
; fiji runs) expects one v_mul_f32 by 1.0 per element; GFX9 expects one
; v_max_f32 per element; GFX11 pairs both elements in a single
; v_dual_max_f32; GFX12 does the same with v_dual_max_num_f32.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2784define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
2785; GFX678-LABEL: v_test_canonicalize_v2f32_flush:
2786; GFX678:       ; %bb.0:
2787; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2788; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2789; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2790; GFX678-NEXT:    s_setpc_b64 s[30:31]
2791;
2792; GFX9-LABEL: v_test_canonicalize_v2f32_flush:
2793; GFX9:       ; %bb.0:
2794; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2795; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
2796; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2797; GFX9-NEXT:    s_setpc_b64 s[30:31]
2798;
2799; GFX11-LABEL: v_test_canonicalize_v2f32_flush:
2800; GFX11:       ; %bb.0:
2801; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2802; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
2803; GFX11-NEXT:    s_setpc_b64 s[30:31]
2804;
2805; GFX12-LABEL: v_test_canonicalize_v2f32_flush:
2806; GFX12:       ; %bb.0:
2807; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2808; GFX12-NEXT:    s_wait_expcnt 0x0
2809; GFX12-NEXT:    s_wait_samplecnt 0x0
2810; GFX12-NEXT:    s_wait_bvhcnt 0x0
2811; GFX12-NEXT:    s_wait_kmcnt 0x0
2812; GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
2813; GFX12-NEXT:    s_setpc_b64 s[30:31]
2814  %canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
2815  ret <2 x float> %canon
2816}
2817
2818
; Function test (non-kernel): returns llvm.canonicalize.v3f32 of the
; by-value <3 x float> argument.
; Uses attribute group #1, which is defined outside this excerpt.
; Lowering pinned by the checks below: the shared GFX678 prefix (hawaii and
; fiji runs) expects one v_mul_f32 by 1.0 per element; GFX9 expects one
; v_max_f32 per element; GFX11 pairs the first two elements in a
; v_dual_max_f32 and handles the third with a plain v_max_f32; GFX12 follows
; the same shape with the max_num variants.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2819define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
2820; GFX678-LABEL: v_test_canonicalize_v3f32_flush:
2821; GFX678:       ; %bb.0:
2822; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2823; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2824; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2825; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2826; GFX678-NEXT:    s_setpc_b64 s[30:31]
2827;
2828; GFX9-LABEL: v_test_canonicalize_v3f32_flush:
2829; GFX9:       ; %bb.0:
2830; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2831; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
2832; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2833; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
2834; GFX9-NEXT:    s_setpc_b64 s[30:31]
2835;
2836; GFX11-LABEL: v_test_canonicalize_v3f32_flush:
2837; GFX11:       ; %bb.0:
2838; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2839; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
2840; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
2841; GFX11-NEXT:    s_setpc_b64 s[30:31]
2842;
2843; GFX12-LABEL: v_test_canonicalize_v3f32_flush:
2844; GFX12:       ; %bb.0:
2845; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2846; GFX12-NEXT:    s_wait_expcnt 0x0
2847; GFX12-NEXT:    s_wait_samplecnt 0x0
2848; GFX12-NEXT:    s_wait_bvhcnt 0x0
2849; GFX12-NEXT:    s_wait_kmcnt 0x0
2850; GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
2851; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
2852; GFX12-NEXT:    s_setpc_b64 s[30:31]
2853  %canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
2854  ret <3 x float> %canon
2855}
2856
2857
; Function test (non-kernel): returns llvm.canonicalize.v4f32 of the
; by-value <4 x float> argument.
; Uses attribute group #1, which is defined outside this excerpt.
; Lowering pinned by the checks below: the shared GFX678 prefix (hawaii and
; fiji runs) expects one v_mul_f32 by 1.0 per element; GFX9 expects one
; v_max_f32 per element; GFX11 covers the four elements with two
; v_dual_max_f32 instructions; GFX12 does the same with v_dual_max_num_f32.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2858define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
2859; GFX678-LABEL: v_test_canonicalize_v4f32_flush:
2860; GFX678:       ; %bb.0:
2861; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2862; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2863; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2864; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2865; GFX678-NEXT:    v_mul_f32_e32 v3, 1.0, v3
2866; GFX678-NEXT:    s_setpc_b64 s[30:31]
2867;
2868; GFX9-LABEL: v_test_canonicalize_v4f32_flush:
2869; GFX9:       ; %bb.0:
2870; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2871; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
2872; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2873; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
2874; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
2875; GFX9-NEXT:    s_setpc_b64 s[30:31]
2876;
2877; GFX11-LABEL: v_test_canonicalize_v4f32_flush:
2878; GFX11:       ; %bb.0:
2879; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2880; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
2881; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
2882; GFX11-NEXT:    s_setpc_b64 s[30:31]
2883;
2884; GFX12-LABEL: v_test_canonicalize_v4f32_flush:
2885; GFX12:       ; %bb.0:
2886; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2887; GFX12-NEXT:    s_wait_expcnt 0x0
2888; GFX12-NEXT:    s_wait_samplecnt 0x0
2889; GFX12-NEXT:    s_wait_bvhcnt 0x0
2890; GFX12-NEXT:    s_wait_kmcnt 0x0
2891; GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
2892; GFX12-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
2893; GFX12-NEXT:    s_setpc_b64 s[30:31]
2894  %canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
2895  ret <4 x float> %canon
2896}
2897
2898
; Function test (non-kernel): returns llvm.canonicalize.v8f32 of the
; by-value <8 x float> argument.
; Uses attribute group #1, which is defined outside this excerpt.
; Lowering pinned by the checks below: the shared GFX678 prefix (hawaii and
; fiji runs) expects one v_mul_f32 by 1.0 per element; GFX9 expects one
; v_max_f32 per element; GFX11 covers the eight elements with four
; v_dual_max_f32 instructions; GFX12 does the same with v_dual_max_num_f32.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2899define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
2900; GFX678-LABEL: v_test_canonicalize_v8f32_flush:
2901; GFX678:       ; %bb.0:
2902; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2903; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
2904; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
2905; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2906; GFX678-NEXT:    v_mul_f32_e32 v3, 1.0, v3
2907; GFX678-NEXT:    v_mul_f32_e32 v4, 1.0, v4
2908; GFX678-NEXT:    v_mul_f32_e32 v5, 1.0, v5
2909; GFX678-NEXT:    v_mul_f32_e32 v6, 1.0, v6
2910; GFX678-NEXT:    v_mul_f32_e32 v7, 1.0, v7
2911; GFX678-NEXT:    s_setpc_b64 s[30:31]
2912;
2913; GFX9-LABEL: v_test_canonicalize_v8f32_flush:
2914; GFX9:       ; %bb.0:
2915; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2916; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
2917; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
2918; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
2919; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
2920; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
2921; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
2922; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
2923; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
2924; GFX9-NEXT:    s_setpc_b64 s[30:31]
2925;
2926; GFX11-LABEL: v_test_canonicalize_v8f32_flush:
2927; GFX11:       ; %bb.0:
2928; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2929; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
2930; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
2931; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
2932; GFX11-NEXT:    v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
2933; GFX11-NEXT:    s_setpc_b64 s[30:31]
2934;
2935; GFX12-LABEL: v_test_canonicalize_v8f32_flush:
2936; GFX12:       ; %bb.0:
2937; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2938; GFX12-NEXT:    s_wait_expcnt 0x0
2939; GFX12-NEXT:    s_wait_samplecnt 0x0
2940; GFX12-NEXT:    s_wait_bvhcnt 0x0
2941; GFX12-NEXT:    s_wait_kmcnt 0x0
2942; GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
2943; GFX12-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
2944; GFX12-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
2945; GFX12-NEXT:    v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
2946; GFX12-NEXT:    s_setpc_b64 s[30:31]
2947  %canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
2948  ret <8 x float> %canon
2949}
2950
; Function test (non-kernel): returns llvm.canonicalize.v2f64 of the
; by-value <2 x double> argument.
; Uses attribute group #1, which is defined outside this excerpt.
; Lowering pinned by the checks below: the shared GFX678 prefix (hawaii and
; fiji runs), GFX9 and GFX11 all expect one v_max_f64 per element (value
; with itself); GFX12 expects v_max_num_f64 instead.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
2951define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
2952; GFX678-LABEL: v_test_canonicalize_v2f64:
2953; GFX678:       ; %bb.0:
2954; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2955; GFX678-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2956; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2957; GFX678-NEXT:    s_setpc_b64 s[30:31]
2958;
2959; GFX9-LABEL: v_test_canonicalize_v2f64:
2960; GFX9:       ; %bb.0:
2961; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2962; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2963; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2964; GFX9-NEXT:    s_setpc_b64 s[30:31]
2965;
2966; GFX11-LABEL: v_test_canonicalize_v2f64:
2967; GFX11:       ; %bb.0:
2968; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2969; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2970; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2971; GFX11-NEXT:    s_setpc_b64 s[30:31]
2972;
2973; GFX12-LABEL: v_test_canonicalize_v2f64:
2974; GFX12:       ; %bb.0:
2975; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2976; GFX12-NEXT:    s_wait_expcnt 0x0
2977; GFX12-NEXT:    s_wait_samplecnt 0x0
2978; GFX12-NEXT:    s_wait_bvhcnt 0x0
2979; GFX12-NEXT:    s_wait_kmcnt 0x0
2980; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
2981; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
2982; GFX12-NEXT:    s_setpc_b64 s[30:31]
2983  %canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
2984  ret <2 x double> %canon
2985}
2986
; Canonicalize a <3 x double> that is passed in and returned by value.
; The generated checks below expect three self-max instructions, one per f64
; element, on register pairs v[0:1], v[2:3] and v[4:5]. The hawaii/fiji, gfx900
; and gfx1100 runs expect v_max_f64; the gfx1200 run expects v_max_num_f64_e32.
2987define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
2988; GFX678-LABEL: v_test_canonicalize_v3f64:
2989; GFX678:       ; %bb.0:
2990; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2991; GFX678-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
2992; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2993; GFX678-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
2994; GFX678-NEXT:    s_setpc_b64 s[30:31]
2995;
2996; GFX9-LABEL: v_test_canonicalize_v3f64:
2997; GFX9:       ; %bb.0:
2998; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2999; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
3000; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3001; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
3002; GFX9-NEXT:    s_setpc_b64 s[30:31]
3003;
3004; GFX11-LABEL: v_test_canonicalize_v3f64:
3005; GFX11:       ; %bb.0:
3006; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3007; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
3008; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3009; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
3010; GFX11-NEXT:    s_setpc_b64 s[30:31]
3011;
3012; GFX12-LABEL: v_test_canonicalize_v3f64:
3013; GFX12:       ; %bb.0:
3014; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3015; GFX12-NEXT:    s_wait_expcnt 0x0
3016; GFX12-NEXT:    s_wait_samplecnt 0x0
3017; GFX12-NEXT:    s_wait_bvhcnt 0x0
3018; GFX12-NEXT:    s_wait_kmcnt 0x0
3019; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
3020; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
3021; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
3022; GFX12-NEXT:    s_setpc_b64 s[30:31]
3023  %canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
3024  ret <3 x double> %canon
3025}
3026
; Canonicalize a <4 x double> that is passed in and returned by value.
; The generated checks below expect four self-max instructions, one per f64
; element, on register pairs v[0:1], v[2:3], v[4:5] and v[6:7]. The hawaii/fiji,
; gfx900 and gfx1100 runs expect v_max_f64; the gfx1200 run expects
; v_max_num_f64_e32.
3027define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
3028; GFX678-LABEL: v_test_canonicalize_v4f64:
3029; GFX678:       ; %bb.0:
3030; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3031; GFX678-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
3032; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3033; GFX678-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
3034; GFX678-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
3035; GFX678-NEXT:    s_setpc_b64 s[30:31]
3036;
3037; GFX9-LABEL: v_test_canonicalize_v4f64:
3038; GFX9:       ; %bb.0:
3039; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3040; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
3041; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3042; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
3043; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
3044; GFX9-NEXT:    s_setpc_b64 s[30:31]
3045;
3046; GFX11-LABEL: v_test_canonicalize_v4f64:
3047; GFX11:       ; %bb.0:
3048; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3049; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
3050; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3051; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
3052; GFX11-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
3053; GFX11-NEXT:    s_setpc_b64 s[30:31]
3054;
3055; GFX12-LABEL: v_test_canonicalize_v4f64:
3056; GFX12:       ; %bb.0:
3057; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3058; GFX12-NEXT:    s_wait_expcnt 0x0
3059; GFX12-NEXT:    s_wait_samplecnt 0x0
3060; GFX12-NEXT:    s_wait_bvhcnt 0x0
3061; GFX12-NEXT:    s_wait_kmcnt 0x0
3062; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
3063; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
3064; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
3065; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
3066; GFX12-NEXT:    s_setpc_b64 s[30:31]
3067  %canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
3068  ret <4 x double> %canon
3069}
3070
; Attribute groups referenced by number from the declarations and test
; functions in this file. Every group is nounwind; they differ in the extra
; attribute: readnone for #0, otherwise a denormal mode string.
; "denormal-fp-math-f32" sets the mode for f32 only, while "denormal-fp-math"
; has no type suffix.
; NOTE(review) - groups #2 and #4 are textually identical; confirm whether both
; are still needed before merging them, since the generated checks depend on
; which group each function names.
3071attributes #0 = { nounwind readnone }
3072attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
3073attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
3074attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" }
3075attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
3076attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" }
3077attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" }
3078attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" }
3079