; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s

declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
declare <2 x float> @llvm.canonicalize.v2f32(<2 x float>) #0
declare <3 x float> @llvm.canonicalize.v3f32(<3 x float>) #0
declare <4 x float> @llvm.canonicalize.v4f32(<4 x float>) #0
declare <8 x float> @llvm.canonicalize.v8f32(<8 x float>) #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.canonicalize.f64(double) #0
declare <2 x double> @llvm.canonicalize.v2f64(<2 x double>) #0
declare <3 x double> @llvm.canonicalize.v3f64(<3 x double>) #0
declare <4 x double> @llvm.canonicalize.v4f64(<4 x double>) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Canonicalize an f32 loaded from %out and store it back. The checks expect a
; multiply by 1.0 on hawaii/fiji and a self-max on gfx900 and newer.
define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: v_test_canonicalize_var_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_load_dword v2, v[0:1]
; GFX678-NEXT:    s_waitcnt vmcnt(0)
; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_var_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_canonicalize_var_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_test_canonicalize_var_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %val = load float, ptr addrspace(1) %out
  %canonicalized = call float @llvm.canonicalize.f32(float %val)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Canonicalize an f32 passed as a kernel argument; the checks read the operand
; from an SGPR (s2) on every target.
define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, float %val) #1 {
; GFX6-LABEL: s_test_canonicalize_var_f32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s2, s[8:9], 0x2
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_f32_e64 v2, 1.0, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    flat_store_dword v[0:1], v2
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_test_canonicalize_var_f32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_f32_e64 v2, 1.0, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_var_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_canonicalize_var_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, s2, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_test_canonicalize_var_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float %val)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(fabs(x)): the checks expect the fabs to appear as a |x| source
; modifier on the canonicalizing instruction.
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_load_dword v2, v[0:1]
; GFX678-NEXT:    s_waitcnt vmcnt(0)
; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, |v2|
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_max_num_f32_e64 v1, |v1|, |v1|
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %val = load float, ptr addrspace(1) %out
  %val.fabs = call float @llvm.fabs.f32(float %val)
  %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(fneg(fabs(x))): hawaii/fiji checks multiply |x| by -1.0; newer
; targets use -|x| source modifiers on the max.
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_load_dword v2, v[0:1]
; GFX678-NEXT:    s_waitcnt vmcnt(0)
; GFX678-NEXT:    v_mul_f32_e64 v2, -1.0, |v2|
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1|
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %val = load float, ptr addrspace(1) %out
  %val.fabs = call float @llvm.fabs.f32(float %val)
  %val.fabs.fneg = fneg float %val.fabs
  %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(fneg(x)): hawaii/fiji checks multiply by -1.0; newer targets
; use -x source modifiers on the max.
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_load_dword v2, v[0:1]
; GFX678-NEXT:    s_waitcnt vmcnt(0)
; GFX678-NEXT:    v_mul_f32_e32 v2, -1.0, v2
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %val = load float, ptr addrspace(1) %out
  %val.fneg = fneg float %val
  %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(undef): the checks expect a plain store of 0 with no
; canonicalizing instruction.
define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_undef_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_undef_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_undef_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_undef_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float undef)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(+0.0) is expected to fold to a store of the constant 0.
define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_p0_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_p0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_p0_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(-0.0) is expected to fold to a constant materialized with
; v_bfrev_b32 of 1.
define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_n0_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_n0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_n0_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(1.0) is expected to fold to the inline constant 1.0.
define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_p1_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 1.0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1.0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_p1_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_p1_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(-1.0) is expected to fold to the inline constant -1.0.
define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_n1_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, -1.0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, -1.0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_n1_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_n1_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize(16.0) is expected to fold to the literal 0x41800000.
define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_literal_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x41800000
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41800000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_literal_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_literal_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Bit pattern 0x007fffff under attribute group #1: the checks expect the
; canonicalize to fold to a store of 0.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same constant under attribute group #5 (declared elsewhere in this file): the
; checks expect the canonicalizing instruction to be kept, not folded.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 {
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_mov_b32 s2, 0x7fffff
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same constant under attribute group #6 (declared elsewhere in this file): the
; checks again keep the canonicalizing instruction.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 {
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_mov_b32 s2, 0x7fffff
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same constant under attribute group #7 (declared elsewhere in this file): the
; checks again keep the canonicalizing instruction.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 {
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    s_mov_b32 s2, 0x7fffff
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_max_f32_e64 v1, s2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_max_f32_e64 v1, 0x7fffff, 0x7fffff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same constant under attribute group #3: the checks expect the bit pattern
; 0x7fffff to be stored unchanged.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 {
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fffff
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fffff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Bit pattern 0x807fffff under attribute group #1: the checks expect a fold to
; the constant produced by v_bfrev_b32 of 1.
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; Same constant under attribute group #3: the checks expect the bit pattern
; 0x807fffff to be stored unchanged.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr addrspace(1) %out) #3 {
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x807fffff
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x807fffff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize of the quiet-NaN constant 0x7FF8000000000000 is expected to
; fold to the f32 bit pattern 0x7fc00000.
define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_qnan_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_qnan_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize of the all-ones bit pattern (i32 -1) is expected to fold to
; 0x7fc00000.
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

; canonicalize of the bit pattern i32 -2 is expected to fold to 0x7fc00000.
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
  store float %canonicalized, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(1) %out) #1 {
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
; GFX678:       ; %bb.0:
; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
; GFX678-NEXT:    v_mov_b32_e32 v0, s0
; GFX678-NEXT:    v_mov_b32_e32 v1, s1
; GFX678-NEXT:    flat_store_dword v[0:1], v2
; GFX678-NEXT:    s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v0, v1,
s[0:1] 959; GFX9-NEXT: s_endpgm 960; 961; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32: 962; GFX11: ; %bb.0: 963; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 964; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 965; GFX11-NEXT: s_waitcnt lgkmcnt(0) 966; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 967; GFX11-NEXT: s_endpgm 968; 969; GFX12-LABEL: test_fold_canonicalize_snan0_value_f32: 970; GFX12: ; %bb.0: 971; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 972; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 973; GFX12-NEXT: s_wait_kmcnt 0x0 974; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 975; GFX12-NEXT: s_endpgm 976 %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) 977 store float %canonicalized, ptr addrspace(1) %out 978 ret void 979} 980 981define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(1) %out) #1 { 982; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: 983; GFX678: ; %bb.0: 984; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 985; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 986; GFX678-NEXT: s_waitcnt lgkmcnt(0) 987; GFX678-NEXT: v_mov_b32_e32 v0, s0 988; GFX678-NEXT: v_mov_b32_e32 v1, s1 989; GFX678-NEXT: flat_store_dword v[0:1], v2 990; GFX678-NEXT: s_endpgm 991; 992; GFX9-LABEL: test_fold_canonicalize_snan1_value_f32: 993; GFX9: ; %bb.0: 994; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 995; GFX9-NEXT: v_mov_b32_e32 v0, 0 996; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 998; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 999; GFX9-NEXT: s_endpgm 1000; 1001; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32: 1002; GFX11: ; %bb.0: 1003; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1004; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 1005; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1007; GFX11-NEXT: s_endpgm 1008; 1009; GFX12-LABEL: 
test_fold_canonicalize_snan1_value_f32: 1010; GFX12: ; %bb.0: 1011; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1012; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 1013; GFX12-NEXT: s_wait_kmcnt 0x0 1014; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1015; GFX12-NEXT: s_endpgm 1016 %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) 1017 store float %canonicalized, ptr addrspace(1) %out 1018 ret void 1019} 1020 1021define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(1) %out) #1 { 1022; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: 1023; GFX678: ; %bb.0: 1024; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1025; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 1026; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1027; GFX678-NEXT: v_mov_b32_e32 v0, s0 1028; GFX678-NEXT: v_mov_b32_e32 v1, s1 1029; GFX678-NEXT: flat_store_dword v[0:1], v2 1030; GFX678-NEXT: s_endpgm 1031; 1032; GFX9-LABEL: test_fold_canonicalize_snan2_value_f32: 1033; GFX9: ; %bb.0: 1034; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1035; GFX9-NEXT: v_mov_b32_e32 v0, 0 1036; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 1037; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1038; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1039; GFX9-NEXT: s_endpgm 1040; 1041; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32: 1042; GFX11: ; %bb.0: 1043; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1044; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 1045; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1046; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1047; GFX11-NEXT: s_endpgm 1048; 1049; GFX12-LABEL: test_fold_canonicalize_snan2_value_f32: 1050; GFX12: ; %bb.0: 1051; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1052; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 1053; GFX12-NEXT: s_wait_kmcnt 0x0 1054; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1055; GFX12-NEXT: s_endpgm 1056 %canonicalized = call float @llvm.canonicalize.f32(float bitcast 
(i32 4286578689 to float)) 1057 store float %canonicalized, ptr addrspace(1) %out 1058 ret void 1059} 1060 1061define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(1) %out) #1 { 1062; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: 1063; GFX678: ; %bb.0: 1064; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1065; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 1066; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX678-NEXT: v_mov_b32_e32 v0, s0 1068; GFX678-NEXT: v_mov_b32_e32 v1, s1 1069; GFX678-NEXT: flat_store_dword v[0:1], v2 1070; GFX678-NEXT: s_endpgm 1071; 1072; GFX9-LABEL: test_fold_canonicalize_snan3_value_f32: 1073; GFX9: ; %bb.0: 1074; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1075; GFX9-NEXT: v_mov_b32_e32 v0, 0 1076; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 1077; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1078; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1079; GFX9-NEXT: s_endpgm 1080; 1081; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32: 1082; GFX11: ; %bb.0: 1083; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1084; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 1085; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1087; GFX11-NEXT: s_endpgm 1088; 1089; GFX12-LABEL: test_fold_canonicalize_snan3_value_f32: 1090; GFX12: ; %bb.0: 1091; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1092; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 1093; GFX12-NEXT: s_wait_kmcnt 0x0 1094; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1095; GFX12-NEXT: s_endpgm 1096 %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) 1097 store float %canonicalized, ptr addrspace(1) %out 1098 ret void 1099} 1100 1101define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 { 1102; GFX678-LABEL: v_test_canonicalize_var_f64: 1103; GFX678: ; %bb.0: 1104; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1105; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1106; 
GFX678-NEXT: v_mov_b32_e32 v0, s0 1107; GFX678-NEXT: v_mov_b32_e32 v1, s1 1108; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 1109; GFX678-NEXT: s_waitcnt vmcnt(0) 1110; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 1111; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1112; GFX678-NEXT: s_endpgm 1113; 1114; GFX9-LABEL: v_test_canonicalize_var_f64: 1115; GFX9: ; %bb.0: 1116; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1117; GFX9-NEXT: v_mov_b32_e32 v2, 0 1118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 1120; GFX9-NEXT: s_waitcnt vmcnt(0) 1121; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 1122; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1123; GFX9-NEXT: s_endpgm 1124; 1125; GFX11-LABEL: v_test_canonicalize_var_f64: 1126; GFX11: ; %bb.0: 1127; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1128; GFX11-NEXT: v_mov_b32_e32 v2, 0 1129; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1131; GFX11-NEXT: s_waitcnt vmcnt(0) 1132; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 1133; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1134; GFX11-NEXT: s_endpgm 1135; 1136; GFX12-LABEL: v_test_canonicalize_var_f64: 1137; GFX12: ; %bb.0: 1138; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1139; GFX12-NEXT: v_mov_b32_e32 v2, 0 1140; GFX12-NEXT: s_wait_kmcnt 0x0 1141; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1142; GFX12-NEXT: s_wait_loadcnt 0x0 1143; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] 1144; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1145; GFX12-NEXT: s_endpgm 1146 %val = load double, ptr addrspace(1) %out 1147 %canonicalized = call double @llvm.canonicalize.f64(double %val) 1148 store double %canonicalized, ptr addrspace(1) %out 1149 ret void 1150} 1151 1152define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, double %val) #1 { 1153; GFX6-LABEL: s_test_canonicalize_var_f64: 1154; GFX6: ; %bb.0: 1155; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1156; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 1157; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] 1158; GFX6-NEXT: v_mov_b32_e32 v0, s0 1159; GFX6-NEXT: v_mov_b32_e32 v1, s1 1160; GFX6-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1161; GFX6-NEXT: s_endpgm 1162; 1163; GFX8-LABEL: s_test_canonicalize_var_f64: 1164; GFX8: ; %bb.0: 1165; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1166; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] 1168; GFX8-NEXT: v_mov_b32_e32 v2, s0 1169; GFX8-NEXT: v_mov_b32_e32 v3, s1 1170; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1171; GFX8-NEXT: s_endpgm 1172; 1173; GFX9-LABEL: s_test_canonicalize_var_f64: 1174; GFX9: ; %bb.0: 1175; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1176; GFX9-NEXT: v_mov_b32_e32 v2, 0 1177; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1178; GFX9-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] 1179; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1180; GFX9-NEXT: s_endpgm 1181; 1182; GFX11-LABEL: s_test_canonicalize_var_f64: 1183; GFX11: ; %bb.0: 1184; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1185; GFX11-NEXT: v_mov_b32_e32 v2, 0 1186; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] 1188; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1189; GFX11-NEXT: s_endpgm 1190; 1191; GFX12-LABEL: s_test_canonicalize_var_f64: 1192; GFX12: ; %bb.0: 1193; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1194; GFX12-NEXT: v_mov_b32_e32 v2, 0 1195; GFX12-NEXT: s_wait_kmcnt 0x0 1196; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] 1197; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1198; GFX12-NEXT: s_endpgm 1199 %canonicalized = call double @llvm.canonicalize.f64(double %val) 1200 store double %canonicalized, ptr addrspace(1) %out 1201 ret void 1202} 1203 1204define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %out) #1 { 1205; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: 1206; GFX678: ; %bb.0: 1207; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1208; 
GFX678-NEXT: s_waitcnt lgkmcnt(0) 1209; GFX678-NEXT: v_mov_b32_e32 v0, s0 1210; GFX678-NEXT: v_mov_b32_e32 v1, s1 1211; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 1212; GFX678-NEXT: s_waitcnt vmcnt(0) 1213; GFX678-NEXT: v_max_f64 v[2:3], |v[2:3]|, |v[2:3]| 1214; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1215; GFX678-NEXT: s_endpgm 1216; 1217; GFX9-LABEL: v_test_canonicalize_fabs_var_f64: 1218; GFX9: ; %bb.0: 1219; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1220; GFX9-NEXT: v_mov_b32_e32 v2, 0 1221; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 1223; GFX9-NEXT: s_waitcnt vmcnt(0) 1224; GFX9-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]| 1225; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1226; GFX9-NEXT: s_endpgm 1227; 1228; GFX11-LABEL: v_test_canonicalize_fabs_var_f64: 1229; GFX11: ; %bb.0: 1230; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1231; GFX11-NEXT: v_mov_b32_e32 v2, 0 1232; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1233; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1234; GFX11-NEXT: s_waitcnt vmcnt(0) 1235; GFX11-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]| 1236; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1237; GFX11-NEXT: s_endpgm 1238; 1239; GFX12-LABEL: v_test_canonicalize_fabs_var_f64: 1240; GFX12: ; %bb.0: 1241; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1242; GFX12-NEXT: v_mov_b32_e32 v2, 0 1243; GFX12-NEXT: s_wait_kmcnt 0x0 1244; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1245; GFX12-NEXT: s_wait_loadcnt 0x0 1246; GFX12-NEXT: v_max_num_f64_e64 v[0:1], |v[0:1]|, |v[0:1]| 1247; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1248; GFX12-NEXT: s_endpgm 1249 %val = load double, ptr addrspace(1) %out 1250 %val.fabs = call double @llvm.fabs.f64(double %val) 1251 %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs) 1252 store double %canonicalized, ptr addrspace(1) %out 1253 ret void 1254} 1255 1256define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1) %out) 
#1 { 1257; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: 1258; GFX678: ; %bb.0: 1259; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1260; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX678-NEXT: v_mov_b32_e32 v0, s0 1262; GFX678-NEXT: v_mov_b32_e32 v1, s1 1263; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 1264; GFX678-NEXT: s_waitcnt vmcnt(0) 1265; GFX678-NEXT: v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| 1266; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1267; GFX678-NEXT: s_endpgm 1268; 1269; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f64: 1270; GFX9: ; %bb.0: 1271; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1272; GFX9-NEXT: v_mov_b32_e32 v2, 0 1273; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 1275; GFX9-NEXT: s_waitcnt vmcnt(0) 1276; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| 1277; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1278; GFX9-NEXT: s_endpgm 1279; 1280; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64: 1281; GFX11: ; %bb.0: 1282; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1283; GFX11-NEXT: v_mov_b32_e32 v2, 0 1284; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1285; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1286; GFX11-NEXT: s_waitcnt vmcnt(0) 1287; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| 1288; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1289; GFX11-NEXT: s_endpgm 1290; 1291; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f64: 1292; GFX12: ; %bb.0: 1293; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1294; GFX12-NEXT: v_mov_b32_e32 v2, 0 1295; GFX12-NEXT: s_wait_kmcnt 0x0 1296; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1297; GFX12-NEXT: s_wait_loadcnt 0x0 1298; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| 1299; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1300; GFX12-NEXT: s_endpgm 1301 %val = load double, ptr addrspace(1) %out 1302 %val.fabs = call double @llvm.fabs.f64(double %val) 1303 %val.fabs.fneg = fneg double %val.fabs 1304 %canonicalized = call double 
@llvm.canonicalize.f64(double %val.fabs.fneg) 1305 store double %canonicalized, ptr addrspace(1) %out 1306 ret void 1307} 1308 1309define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %out) #1 { 1310; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: 1311; GFX678: ; %bb.0: 1312; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1313; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1314; GFX678-NEXT: v_mov_b32_e32 v0, s0 1315; GFX678-NEXT: v_mov_b32_e32 v1, s1 1316; GFX678-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 1317; GFX678-NEXT: s_waitcnt vmcnt(0) 1318; GFX678-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] 1319; GFX678-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1320; GFX678-NEXT: s_endpgm 1321; 1322; GFX9-LABEL: v_test_canonicalize_fneg_var_f64: 1323; GFX9: ; %bb.0: 1324; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1325; GFX9-NEXT: v_mov_b32_e32 v2, 0 1326; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 1328; GFX9-NEXT: s_waitcnt vmcnt(0) 1329; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] 1330; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1331; GFX9-NEXT: s_endpgm 1332; 1333; GFX11-LABEL: v_test_canonicalize_fneg_var_f64: 1334; GFX11: ; %bb.0: 1335; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1336; GFX11-NEXT: v_mov_b32_e32 v2, 0 1337; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1338; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1339; GFX11-NEXT: s_waitcnt vmcnt(0) 1340; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] 1341; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1342; GFX11-NEXT: s_endpgm 1343; 1344; GFX12-LABEL: v_test_canonicalize_fneg_var_f64: 1345; GFX12: ; %bb.0: 1346; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1347; GFX12-NEXT: v_mov_b32_e32 v2, 0 1348; GFX12-NEXT: s_wait_kmcnt 0x0 1349; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] 1350; GFX12-NEXT: s_wait_loadcnt 0x0 1351; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] 1352; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1353; GFX12-NEXT: s_endpgm 1354 
%val = load double, ptr addrspace(1) %out 1355 %val.fneg = fneg double %val 1356 %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg) 1357 store double %canonicalized, ptr addrspace(1) %out 1358 ret void 1359} 1360 1361define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) #1 { 1362; GFX678-LABEL: test_fold_canonicalize_p0_f64: 1363; GFX678: ; %bb.0: 1364; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1365; GFX678-NEXT: v_mov_b32_e32 v0, 0 1366; GFX678-NEXT: v_mov_b32_e32 v1, v0 1367; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX678-NEXT: v_mov_b32_e32 v3, s1 1369; GFX678-NEXT: v_mov_b32_e32 v2, s0 1370; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1371; GFX678-NEXT: s_endpgm 1372; 1373; GFX9-LABEL: test_fold_canonicalize_p0_f64: 1374; GFX9: ; %bb.0: 1375; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1376; GFX9-NEXT: v_mov_b32_e32 v0, 0 1377; GFX9-NEXT: v_mov_b32_e32 v1, v0 1378; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1379; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1380; GFX9-NEXT: s_endpgm 1381; 1382; GFX11-LABEL: test_fold_canonicalize_p0_f64: 1383; GFX11: ; %bb.0: 1384; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1385; GFX11-NEXT: v_mov_b32_e32 v0, 0 1386; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1387; GFX11-NEXT: v_mov_b32_e32 v1, v0 1388; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1390; GFX11-NEXT: s_endpgm 1391; 1392; GFX12-LABEL: test_fold_canonicalize_p0_f64: 1393; GFX12: ; %bb.0: 1394; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1395; GFX12-NEXT: v_mov_b32_e32 v0, 0 1396; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1397; GFX12-NEXT: v_mov_b32_e32 v1, v0 1398; GFX12-NEXT: s_wait_kmcnt 0x0 1399; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1400; GFX12-NEXT: s_endpgm 1401 %canonicalized = call double @llvm.canonicalize.f64(double 0.0) 1402 store double %canonicalized, ptr addrspace(1) %out 1403 ret void 1404} 1405 1406define amdgpu_kernel void 
@test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) #1 { 1407; GFX678-LABEL: test_fold_canonicalize_n0_f64: 1408; GFX678: ; %bb.0: 1409; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1410; GFX678-NEXT: v_mov_b32_e32 v0, 0 1411; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 1412; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1413; GFX678-NEXT: v_mov_b32_e32 v3, s1 1414; GFX678-NEXT: v_mov_b32_e32 v2, s0 1415; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1416; GFX678-NEXT: s_endpgm 1417; 1418; GFX9-LABEL: test_fold_canonicalize_n0_f64: 1419; GFX9: ; %bb.0: 1420; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1421; GFX9-NEXT: v_mov_b32_e32 v0, 0 1422; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1423; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1424; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1425; GFX9-NEXT: s_endpgm 1426; 1427; GFX11-LABEL: test_fold_canonicalize_n0_f64: 1428; GFX11: ; %bb.0: 1429; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1430; GFX11-NEXT: v_mov_b32_e32 v0, 0 1431; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 1432; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1434; GFX11-NEXT: s_endpgm 1435; 1436; GFX12-LABEL: test_fold_canonicalize_n0_f64: 1437; GFX12: ; %bb.0: 1438; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1439; GFX12-NEXT: v_mov_b32_e32 v0, 0 1440; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 1441; GFX12-NEXT: s_wait_kmcnt 0x0 1442; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1443; GFX12-NEXT: s_endpgm 1444 %canonicalized = call double @llvm.canonicalize.f64(double -0.0) 1445 store double %canonicalized, ptr addrspace(1) %out 1446 ret void 1447} 1448 1449define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) #1 { 1450; GFX678-LABEL: test_fold_canonicalize_p1_f64: 1451; GFX678: ; %bb.0: 1452; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1453; GFX678-NEXT: v_mov_b32_e32 v0, 0 1454; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 1455; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1456; GFX678-NEXT: v_mov_b32_e32 v3, s1 1457; GFX678-NEXT: 
v_mov_b32_e32 v2, s0 1458; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1459; GFX678-NEXT: s_endpgm 1460; 1461; GFX9-LABEL: test_fold_canonicalize_p1_f64: 1462; GFX9: ; %bb.0: 1463; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1464; GFX9-NEXT: v_mov_b32_e32 v0, 0 1465; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 1466; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1467; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1468; GFX9-NEXT: s_endpgm 1469; 1470; GFX11-LABEL: test_fold_canonicalize_p1_f64: 1471; GFX11: ; %bb.0: 1472; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1473; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 1474; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1475; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1476; GFX11-NEXT: s_endpgm 1477; 1478; GFX12-LABEL: test_fold_canonicalize_p1_f64: 1479; GFX12: ; %bb.0: 1480; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1481; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 1482; GFX12-NEXT: s_wait_kmcnt 0x0 1483; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1484; GFX12-NEXT: s_endpgm 1485 %canonicalized = call double @llvm.canonicalize.f64(double 1.0) 1486 store double %canonicalized, ptr addrspace(1) %out 1487 ret void 1488} 1489 1490define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) #1 { 1491; GFX678-LABEL: test_fold_canonicalize_n1_f64: 1492; GFX678: ; %bb.0: 1493; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1494; GFX678-NEXT: v_mov_b32_e32 v0, 0 1495; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 1496; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1497; GFX678-NEXT: v_mov_b32_e32 v3, s1 1498; GFX678-NEXT: v_mov_b32_e32 v2, s0 1499; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1500; GFX678-NEXT: s_endpgm 1501; 1502; GFX9-LABEL: test_fold_canonicalize_n1_f64: 1503; GFX9: ; %bb.0: 1504; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1505; GFX9-NEXT: v_mov_b32_e32 v0, 0 1506; GFX9-NEXT: v_mov_b32_e32 v1, 0xbff00000 1507; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1508; GFX9-NEXT: 
global_store_dwordx2 v0, v[0:1], s[0:1] 1509; GFX9-NEXT: s_endpgm 1510; 1511; GFX11-LABEL: test_fold_canonicalize_n1_f64: 1512; GFX11: ; %bb.0: 1513; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1514; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 1515; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1516; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1517; GFX11-NEXT: s_endpgm 1518; 1519; GFX12-LABEL: test_fold_canonicalize_n1_f64: 1520; GFX12: ; %bb.0: 1521; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1522; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 1523; GFX12-NEXT: s_wait_kmcnt 0x0 1524; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1525; GFX12-NEXT: s_endpgm 1526 %canonicalized = call double @llvm.canonicalize.f64(double -1.0) 1527 store double %canonicalized, ptr addrspace(1) %out 1528 ret void 1529} 1530 1531define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %out) #1 { 1532; GFX678-LABEL: test_fold_canonicalize_literal_f64: 1533; GFX678: ; %bb.0: 1534; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1535; GFX678-NEXT: v_mov_b32_e32 v0, 0 1536; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 1537; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX678-NEXT: v_mov_b32_e32 v3, s1 1539; GFX678-NEXT: v_mov_b32_e32 v2, s0 1540; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1541; GFX678-NEXT: s_endpgm 1542; 1543; GFX9-LABEL: test_fold_canonicalize_literal_f64: 1544; GFX9: ; %bb.0: 1545; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1546; GFX9-NEXT: v_mov_b32_e32 v0, 0 1547; GFX9-NEXT: v_mov_b32_e32 v1, 0x40300000 1548; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1549; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1550; GFX9-NEXT: s_endpgm 1551; 1552; GFX11-LABEL: test_fold_canonicalize_literal_f64: 1553; GFX11: ; %bb.0: 1554; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1555; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 1556; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1557; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 
1558; GFX11-NEXT: s_endpgm 1559; 1560; GFX12-LABEL: test_fold_canonicalize_literal_f64: 1561; GFX12: ; %bb.0: 1562; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1563; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 1564; GFX12-NEXT: s_wait_kmcnt 0x0 1565; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1566; GFX12-NEXT: s_endpgm 1567 %canonicalized = call double @llvm.canonicalize.f64(double 16.0) 1568 store double %canonicalized, ptr addrspace(1) %out 1569 ret void 1570} 1571 1572define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #2 { 1573; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: 1574; GFX678: ; %bb.0: 1575; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1576; GFX678-NEXT: v_mov_b32_e32 v0, 0 1577; GFX678-NEXT: v_mov_b32_e32 v1, v0 1578; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1579; GFX678-NEXT: v_mov_b32_e32 v3, s1 1580; GFX678-NEXT: v_mov_b32_e32 v2, s0 1581; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1582; GFX678-NEXT: s_endpgm 1583; 1584; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: 1585; GFX9: ; %bb.0: 1586; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1587; GFX9-NEXT: v_mov_b32_e32 v0, 0 1588; GFX9-NEXT: v_mov_b32_e32 v1, v0 1589; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1590; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1591; GFX9-NEXT: s_endpgm 1592; 1593; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: 1594; GFX11: ; %bb.0: 1595; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1596; GFX11-NEXT: v_mov_b32_e32 v0, 0 1597; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1598; GFX11-NEXT: v_mov_b32_e32 v1, v0 1599; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1600; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1601; GFX11-NEXT: s_endpgm 1602; 1603; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: 1604; GFX12: ; %bb.0: 1605; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1606; GFX12-NEXT: v_mov_b32_e32 v0, 0 1607; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 1608; GFX12-NEXT: v_mov_b32_e32 v1, v0 1609; GFX12-NEXT: s_wait_kmcnt 0x0 1610; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1611; GFX12-NEXT: s_endpgm 1612 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) 1613 store double %canonicalized, ptr addrspace(1) %out 1614 ret void 1615} 1616 1617define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr addrspace(1) %out) #3 { 1618; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: 1619; GFX678: ; %bb.0: 1620; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1621; GFX678-NEXT: v_mov_b32_e32 v0, -1 1622; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff 1623; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1624; GFX678-NEXT: v_mov_b32_e32 v3, s1 1625; GFX678-NEXT: v_mov_b32_e32 v2, s0 1626; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1627; GFX678-NEXT: s_endpgm 1628; 1629; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f64: 1630; GFX9: ; %bb.0: 1631; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1632; GFX9-NEXT: v_mov_b32_e32 v2, 0 1633; GFX9-NEXT: v_mov_b32_e32 v0, -1 1634; GFX9-NEXT: v_mov_b32_e32 v1, 0xfffff 1635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1637; GFX9-NEXT: s_endpgm 1638; 1639; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64: 1640; GFX11: ; %bb.0: 1641; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1642; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff 1643; GFX11-NEXT: v_mov_b32_e32 v0, -1 1644; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1645; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1646; GFX11-NEXT: s_endpgm 1647; 1648; GFX12-LABEL: test_denormals_fold_canonicalize_denormal0_f64: 1649; GFX12: ; %bb.0: 1650; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1651; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0xfffff 1652; GFX12-NEXT: v_mov_b32_e32 v0, -1 1653; GFX12-NEXT: s_wait_kmcnt 0x0 1654; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 
1655; GFX12-NEXT: s_endpgm 1656 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) 1657 store double %canonicalized, ptr addrspace(1) %out 1658 ret void 1659} 1660 1661define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #2 { 1662; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: 1663; GFX678: ; %bb.0: 1664; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1665; GFX678-NEXT: v_mov_b32_e32 v0, 0 1666; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 1667; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1668; GFX678-NEXT: v_mov_b32_e32 v3, s1 1669; GFX678-NEXT: v_mov_b32_e32 v2, s0 1670; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1671; GFX678-NEXT: s_endpgm 1672; 1673; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: 1674; GFX9: ; %bb.0: 1675; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1676; GFX9-NEXT: v_mov_b32_e32 v0, 0 1677; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1678; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1679; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1680; GFX9-NEXT: s_endpgm 1681; 1682; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: 1683; GFX11: ; %bb.0: 1684; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1685; GFX11-NEXT: v_mov_b32_e32 v0, 0 1686; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 1687; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1688; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1689; GFX11-NEXT: s_endpgm 1690; 1691; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: 1692; GFX12: ; %bb.0: 1693; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1694; GFX12-NEXT: v_mov_b32_e32 v0, 0 1695; GFX12-NEXT: v_bfrev_b32_e32 v1, 1 1696; GFX12-NEXT: s_wait_kmcnt 0x0 1697; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1698; GFX12-NEXT: s_endpgm 1699 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) 1700 store double %canonicalized, ptr addrspace(1) %out 1701 ret void 1702} 1703 1704define amdgpu_kernel 
void @test_denormals_fold_canonicalize_denormal1_f64(ptr addrspace(1) %out) #3 { 1705; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: 1706; GFX678: ; %bb.0: 1707; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1708; GFX678-NEXT: v_mov_b32_e32 v0, -1 1709; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff 1710; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1711; GFX678-NEXT: v_mov_b32_e32 v3, s1 1712; GFX678-NEXT: v_mov_b32_e32 v2, s0 1713; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1714; GFX678-NEXT: s_endpgm 1715; 1716; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f64: 1717; GFX9: ; %bb.0: 1718; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1719; GFX9-NEXT: v_mov_b32_e32 v2, 0 1720; GFX9-NEXT: v_mov_b32_e32 v0, -1 1721; GFX9-NEXT: v_mov_b32_e32 v1, 0x800fffff 1722; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1723; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1724; GFX9-NEXT: s_endpgm 1725; 1726; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64: 1727; GFX11: ; %bb.0: 1728; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1729; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff 1730; GFX11-NEXT: v_mov_b32_e32 v0, -1 1731; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1732; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1733; GFX11-NEXT: s_endpgm 1734; 1735; GFX12-LABEL: test_denormals_fold_canonicalize_denormal1_f64: 1736; GFX12: ; %bb.0: 1737; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1738; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x800fffff 1739; GFX12-NEXT: v_mov_b32_e32 v0, -1 1740; GFX12-NEXT: s_wait_kmcnt 0x0 1741; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1742; GFX12-NEXT: s_endpgm 1743 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) 1744 store double %canonicalized, ptr addrspace(1) %out 1745 ret void 1746} 1747 1748define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out) #1 { 1749; GFX678-LABEL: test_fold_canonicalize_qnan_f64: 1750; GFX678: ; 
%bb.0: 1751; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1752; GFX678-NEXT: v_mov_b32_e32 v0, 0 1753; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1754; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX678-NEXT: v_mov_b32_e32 v3, s1 1756; GFX678-NEXT: v_mov_b32_e32 v2, s0 1757; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1758; GFX678-NEXT: s_endpgm 1759; 1760; GFX9-LABEL: test_fold_canonicalize_qnan_f64: 1761; GFX9: ; %bb.0: 1762; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1763; GFX9-NEXT: v_mov_b32_e32 v0, 0 1764; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1765; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1766; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1767; GFX9-NEXT: s_endpgm 1768; 1769; GFX11-LABEL: test_fold_canonicalize_qnan_f64: 1770; GFX11: ; %bb.0: 1771; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1772; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1773; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1774; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1775; GFX11-NEXT: s_endpgm 1776; 1777; GFX12-LABEL: test_fold_canonicalize_qnan_f64: 1778; GFX12: ; %bb.0: 1779; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1780; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1781; GFX12-NEXT: s_wait_kmcnt 0x0 1782; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1783; GFX12-NEXT: s_endpgm 1784 %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) 1785 store double %canonicalized, ptr addrspace(1) %out 1786 ret void 1787} 1788 1789define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrspace(1) %out) #1 { 1790; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: 1791; GFX678: ; %bb.0: 1792; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1793; GFX678-NEXT: v_mov_b32_e32 v0, 0 1794; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1795; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1796; GFX678-NEXT: v_mov_b32_e32 v3, s1 1797; GFX678-NEXT: v_mov_b32_e32 v2, s0 1798; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1799; 
GFX678-NEXT: s_endpgm 1800; 1801; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: 1802; GFX9: ; %bb.0: 1803; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1804; GFX9-NEXT: v_mov_b32_e32 v0, 0 1805; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1806; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1807; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1808; GFX9-NEXT: s_endpgm 1809; 1810; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: 1811; GFX11: ; %bb.0: 1812; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1813; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1814; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1815; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1816; GFX11-NEXT: s_endpgm 1817; 1818; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: 1819; GFX12: ; %bb.0: 1820; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1821; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1822; GFX12-NEXT: s_wait_kmcnt 0x0 1823; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1824; GFX12-NEXT: s_endpgm 1825 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) 1826 store double %canonicalized, ptr addrspace(1) %out 1827 ret void 1828} 1829 1830define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrspace(1) %out) #1 { 1831; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: 1832; GFX678: ; %bb.0: 1833; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1834; GFX678-NEXT: v_mov_b32_e32 v0, 0 1835; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1836; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1837; GFX678-NEXT: v_mov_b32_e32 v3, s1 1838; GFX678-NEXT: v_mov_b32_e32 v2, s0 1839; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1840; GFX678-NEXT: s_endpgm 1841; 1842; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: 1843; GFX9: ; %bb.0: 1844; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1845; GFX9-NEXT: v_mov_b32_e32 v0, 0 1846; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1847; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1848; 
GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1849; GFX9-NEXT: s_endpgm 1850; 1851; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: 1852; GFX11: ; %bb.0: 1853; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1854; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1855; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1856; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1857; GFX11-NEXT: s_endpgm 1858; 1859; GFX12-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: 1860; GFX12: ; %bb.0: 1861; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1862; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1863; GFX12-NEXT: s_wait_kmcnt 0x0 1864; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1865; GFX12-NEXT: s_endpgm 1866 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) 1867 store double %canonicalized, ptr addrspace(1) %out 1868 ret void 1869} 1870 1871define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(1) %out) #1 { 1872; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: 1873; GFX678: ; %bb.0: 1874; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1875; GFX678-NEXT: v_mov_b32_e32 v0, 0 1876; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1877; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1878; GFX678-NEXT: v_mov_b32_e32 v3, s1 1879; GFX678-NEXT: v_mov_b32_e32 v2, s0 1880; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1881; GFX678-NEXT: s_endpgm 1882; 1883; GFX9-LABEL: test_fold_canonicalize_snan0_value_f64: 1884; GFX9: ; %bb.0: 1885; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1886; GFX9-NEXT: v_mov_b32_e32 v0, 0 1887; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1888; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1889; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1890; GFX9-NEXT: s_endpgm 1891; 1892; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64: 1893; GFX11: ; %bb.0: 1894; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1895; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1896; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) 1897; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1898; GFX11-NEXT: s_endpgm 1899; 1900; GFX12-LABEL: test_fold_canonicalize_snan0_value_f64: 1901; GFX12: ; %bb.0: 1902; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1903; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1904; GFX12-NEXT: s_wait_kmcnt 0x0 1905; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1906; GFX12-NEXT: s_endpgm 1907 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) 1908 store double %canonicalized, ptr addrspace(1) %out 1909 ret void 1910} 1911 1912define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(1) %out) #1 { 1913; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: 1914; GFX678: ; %bb.0: 1915; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1916; GFX678-NEXT: v_mov_b32_e32 v0, 0 1917; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1918; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1919; GFX678-NEXT: v_mov_b32_e32 v3, s1 1920; GFX678-NEXT: v_mov_b32_e32 v2, s0 1921; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1922; GFX678-NEXT: s_endpgm 1923; 1924; GFX9-LABEL: test_fold_canonicalize_snan1_value_f64: 1925; GFX9: ; %bb.0: 1926; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1927; GFX9-NEXT: v_mov_b32_e32 v0, 0 1928; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1929; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1930; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1931; GFX9-NEXT: s_endpgm 1932; 1933; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64: 1934; GFX11: ; %bb.0: 1935; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1936; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1937; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1939; GFX11-NEXT: s_endpgm 1940; 1941; GFX12-LABEL: test_fold_canonicalize_snan1_value_f64: 1942; GFX12: ; %bb.0: 1943; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1944; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: 
v_dual_mov_b32 v1, 0x7ff80000 1945; GFX12-NEXT: s_wait_kmcnt 0x0 1946; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1947; GFX12-NEXT: s_endpgm 1948 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) 1949 store double %canonicalized, ptr addrspace(1) %out 1950 ret void 1951} 1952 1953define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(1) %out) #1 { 1954; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: 1955; GFX678: ; %bb.0: 1956; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1957; GFX678-NEXT: v_mov_b32_e32 v0, 0 1958; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1959; GFX678-NEXT: s_waitcnt lgkmcnt(0) 1960; GFX678-NEXT: v_mov_b32_e32 v3, s1 1961; GFX678-NEXT: v_mov_b32_e32 v2, s0 1962; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1963; GFX678-NEXT: s_endpgm 1964; 1965; GFX9-LABEL: test_fold_canonicalize_snan2_value_f64: 1966; GFX9: ; %bb.0: 1967; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1968; GFX9-NEXT: v_mov_b32_e32 v0, 0 1969; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 1970; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1971; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 1972; GFX9-NEXT: s_endpgm 1973; 1974; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64: 1975; GFX11: ; %bb.0: 1976; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1977; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1978; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1980; GFX11-NEXT: s_endpgm 1981; 1982; GFX12-LABEL: test_fold_canonicalize_snan2_value_f64: 1983; GFX12: ; %bb.0: 1984; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1985; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 1986; GFX12-NEXT: s_wait_kmcnt 0x0 1987; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 1988; GFX12-NEXT: s_endpgm 1989 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) 1990 store double %canonicalized, ptr 
addrspace(1) %out 1991 ret void 1992} 1993 1994define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(1) %out) #1 { 1995; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: 1996; GFX678: ; %bb.0: 1997; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1998; GFX678-NEXT: v_mov_b32_e32 v0, 0 1999; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 2000; GFX678-NEXT: s_waitcnt lgkmcnt(0) 2001; GFX678-NEXT: v_mov_b32_e32 v3, s1 2002; GFX678-NEXT: v_mov_b32_e32 v2, s0 2003; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2004; GFX678-NEXT: s_endpgm 2005; 2006; GFX9-LABEL: test_fold_canonicalize_snan3_value_f64: 2007; GFX9: ; %bb.0: 2008; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2009; GFX9-NEXT: v_mov_b32_e32 v0, 0 2010; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 2011; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2012; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] 2013; GFX9-NEXT: s_endpgm 2014; 2015; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64: 2016; GFX11: ; %bb.0: 2017; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2018; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 2019; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] 2021; GFX11-NEXT: s_endpgm 2022; 2023; GFX12-LABEL: test_fold_canonicalize_snan3_value_f64: 2024; GFX12: ; %bb.0: 2025; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2026; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 2027; GFX12-NEXT: s_wait_kmcnt 0x0 2028; GFX12-NEXT: global_store_b64 v0, v[0:1], s[0:1] 2029; GFX12-NEXT: s_endpgm 2030 %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) 2031 store double %canonicalized, ptr addrspace(1) %out 2032 ret void 2033} 2034 2035define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { 2036; GFX6-LABEL: test_canonicalize_value_f64_flush: 2037; GFX6: ; %bb.0: 2038; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2039; 
GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2040; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2041; GFX6-NEXT: v_mov_b32_e32 v1, s1 2042; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2043; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2044; GFX6-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2045; GFX6-NEXT: v_mov_b32_e32 v3, s3 2046; GFX6-NEXT: v_add_i32_e32 v2, vcc, s2, v2 2047; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2048; GFX6-NEXT: s_waitcnt vmcnt(0) 2049; GFX6-NEXT: v_mul_f64 v[0:1], 1.0, v[0:1] 2050; GFX6-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2051; GFX6-NEXT: s_endpgm 2052; 2053; GFX8-LABEL: test_canonicalize_value_f64_flush: 2054; GFX8: ; %bb.0: 2055; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2056; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2057; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2058; GFX8-NEXT: v_mov_b32_e32 v1, s1 2059; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2060; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2061; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2062; GFX8-NEXT: v_mov_b32_e32 v3, s3 2063; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 2064; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2065; GFX8-NEXT: s_waitcnt vmcnt(0) 2066; GFX8-NEXT: v_mul_f64 v[0:1], 1.0, v[0:1] 2067; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2068; GFX8-NEXT: s_endpgm 2069; 2070; GFX9-LABEL: test_canonicalize_value_f64_flush: 2071; GFX9: ; %bb.0: 2072; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2073; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2074; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2075; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 2076; GFX9-NEXT: s_waitcnt vmcnt(0) 2077; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 2078; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2079; GFX9-NEXT: s_endpgm 2080; 2081; GFX11-LABEL: test_canonicalize_value_f64_flush: 2082; GFX11: ; %bb.0: 2083; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2084; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2085; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2086; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2087; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) 2088; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 2089; GFX11-NEXT: s_waitcnt vmcnt(0) 2090; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 2091; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] 2092; GFX11-NEXT: s_endpgm 2093; 2094; GFX12-LABEL: test_canonicalize_value_f64_flush: 2095; GFX12: ; %bb.0: 2096; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2097; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2098; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2099; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2100; GFX12-NEXT: s_wait_kmcnt 0x0 2101; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] 2102; GFX12-NEXT: s_wait_loadcnt 0x0 2103; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] 2104; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] 2105; GFX12-NEXT: s_endpgm 2106 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2107 %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id 2108 %v = load double, ptr addrspace(1) %gep, align 8 2109 %canonicalized = tail call double @llvm.canonicalize.f64(double %v) 2110 %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id 2111 store double %canonicalized, ptr addrspace(1) %gep2, align 8 2112 ret void 2113} 2114 2115define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { 2116; GFX6-LABEL: test_canonicalize_value_f32_flush: 2117; GFX6: ; %bb.0: 2118; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2119; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2120; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2121; GFX6-NEXT: v_mov_b32_e32 v1, s1 2122; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2123; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2124; GFX6-NEXT: flat_load_dword v0, v[0:1] 2125; GFX6-NEXT: v_mov_b32_e32 v1, s3 2126; GFX6-NEXT: s_waitcnt vmcnt(0) 2127; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 2128; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2129; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2130; GFX6-NEXT: flat_store_dword v[0:1], v3 2131; GFX6-NEXT: s_endpgm 
2132; 2133; GFX8-LABEL: test_canonicalize_value_f32_flush: 2134; GFX8: ; %bb.0: 2135; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2136; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2137; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2138; GFX8-NEXT: v_mov_b32_e32 v1, s1 2139; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2140; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2141; GFX8-NEXT: flat_load_dword v0, v[0:1] 2142; GFX8-NEXT: v_mov_b32_e32 v1, s3 2143; GFX8-NEXT: s_waitcnt vmcnt(0) 2144; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 2145; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2146; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2147; GFX8-NEXT: flat_store_dword v[0:1], v3 2148; GFX8-NEXT: s_endpgm 2149; 2150; GFX9-LABEL: test_canonicalize_value_f32_flush: 2151; GFX9: ; %bb.0: 2152; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2153; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2154; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2155; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 2156; GFX9-NEXT: s_waitcnt vmcnt(0) 2157; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 2158; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 2159; GFX9-NEXT: s_endpgm 2160; 2161; GFX11-LABEL: test_canonicalize_value_f32_flush: 2162; GFX11: ; %bb.0: 2163; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2164; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2165; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2166; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2167; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2168; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] 2169; GFX11-NEXT: s_waitcnt vmcnt(0) 2170; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 2171; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] 2172; GFX11-NEXT: s_endpgm 2173; 2174; GFX12-LABEL: test_canonicalize_value_f32_flush: 2175; GFX12: ; %bb.0: 2176; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2177; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2178; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2179; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2180; GFX12-NEXT: s_wait_kmcnt 0x0 2181; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] 2182; GFX12-NEXT: 
s_wait_loadcnt 0x0 2183; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1 2184; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] 2185; GFX12-NEXT: s_endpgm 2186 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2187 %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id 2188 %v = load float, ptr addrspace(1) %gep, align 4 2189 %canonicalized = tail call float @llvm.canonicalize.f32(float %v) 2190 %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id 2191 store float %canonicalized, ptr addrspace(1) %gep2, align 4 2192 ret void 2193} 2194 2195define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { 2196; GFX6-LABEL: test_canonicalize_value_f16_flush: 2197; GFX6: ; %bb.0: 2198; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2199; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 2200; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2201; GFX6-NEXT: v_mov_b32_e32 v1, s1 2202; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2203; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2204; GFX6-NEXT: flat_load_ushort v0, v[0:1] 2205; GFX6-NEXT: v_mov_b32_e32 v1, s3 2206; GFX6-NEXT: s_waitcnt vmcnt(0) 2207; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 2208; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 2209; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2210; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2211; GFX6-NEXT: flat_store_short v[0:1], v3 2212; GFX6-NEXT: s_endpgm 2213; 2214; GFX8-LABEL: test_canonicalize_value_f16_flush: 2215; GFX8: ; %bb.0: 2216; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2217; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 2218; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2219; GFX8-NEXT: v_mov_b32_e32 v1, s1 2220; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2221; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2222; GFX8-NEXT: flat_load_ushort v0, v[0:1] 2223; GFX8-NEXT: v_mov_b32_e32 v1, s3 2224; GFX8-NEXT: s_waitcnt vmcnt(0) 2225; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v0 2226; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2227; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc 2228; GFX8-NEXT: flat_store_short v[0:1], v3 2229; GFX8-NEXT: s_endpgm 2230; 2231; GFX9-LABEL: test_canonicalize_value_f16_flush: 2232; GFX9: ; %bb.0: 2233; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2234; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2235; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2236; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] 2237; GFX9-NEXT: s_waitcnt vmcnt(0) 2238; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 2239; GFX9-NEXT: global_store_short v0, v1, s[2:3] 2240; GFX9-NEXT: s_endpgm 2241; 2242; GFX11-LABEL: test_canonicalize_value_f16_flush: 2243; GFX11: ; %bb.0: 2244; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2245; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2246; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2247; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2248; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2249; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] 2250; GFX11-NEXT: s_waitcnt vmcnt(0) 2251; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 2252; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] 2253; GFX11-NEXT: s_endpgm 2254; 2255; GFX12-LABEL: test_canonicalize_value_f16_flush: 2256; GFX12: ; %bb.0: 2257; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2258; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2259; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2260; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2261; GFX12-NEXT: s_wait_kmcnt 0x0 2262; GFX12-NEXT: global_load_u16 v1, v0, s[0:1] 2263; GFX12-NEXT: s_wait_loadcnt 0x0 2264; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 2265; GFX12-NEXT: global_store_b16 v0, v1, s[2:3] 2266; GFX12-NEXT: s_endpgm 2267 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2268 %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id 2269 %v = load half, ptr addrspace(1) %gep, align 2 2270 %canonicalized = tail call half @llvm.canonicalize.f16(half %v) 2271 %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id 2272 store half %canonicalized, ptr addrspace(1) %gep2, align 2 2273 ret void 2274} 2275 2276 2277define amdgpu_kernel void 
@test_canonicalize_value_v2f16_flush(ptr addrspace(1) %arg, ptr addrspace(1) %out) #4 { 2278; GFX6-LABEL: test_canonicalize_value_v2f16_flush: 2279; GFX6: ; %bb.0: 2280; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2281; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2282; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2283; GFX6-NEXT: v_mov_b32_e32 v1, s1 2284; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2285; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2286; GFX6-NEXT: flat_load_dword v0, v[0:1] 2287; GFX6-NEXT: v_mov_b32_e32 v3, s3 2288; GFX6-NEXT: s_waitcnt vmcnt(0) 2289; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 2290; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 2291; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 2292; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 2293; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 2294; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2295; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 2296; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2297; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 2298; GFX6-NEXT: flat_store_dword v[0:1], v4 2299; GFX6-NEXT: s_endpgm 2300; 2301; GFX8-LABEL: test_canonicalize_value_v2f16_flush: 2302; GFX8: ; %bb.0: 2303; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2304; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2305; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2306; GFX8-NEXT: v_mov_b32_e32 v1, s1 2307; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2308; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2309; GFX8-NEXT: flat_load_dword v0, v[0:1] 2310; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00 2311; GFX8-NEXT: v_mov_b32_e32 v3, s3 2312; GFX8-NEXT: s_waitcnt vmcnt(0) 2313; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2314; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 2315; GFX8-NEXT: v_or_b32_e32 v4, v0, v1 2316; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2317; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 2318; GFX8-NEXT: flat_store_dword v[0:1], v4 2319; GFX8-NEXT: s_endpgm 2320; 2321; GFX9-LABEL: test_canonicalize_value_v2f16_flush: 2322; GFX9: ; %bb.0: 2323; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2324; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2325; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2326; GFX9-NEXT: global_load_dword v1, v0, s[0:1] 2327; GFX9-NEXT: s_waitcnt vmcnt(0) 2328; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 2329; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 2330; GFX9-NEXT: s_endpgm 2331; 2332; GFX11-LABEL: test_canonicalize_value_v2f16_flush: 2333; GFX11: ; %bb.0: 2334; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2335; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2336; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2337; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2338; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2339; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] 2340; GFX11-NEXT: s_waitcnt vmcnt(0) 2341; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 2342; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] 2343; GFX11-NEXT: s_endpgm 2344; 2345; GFX12-LABEL: test_canonicalize_value_v2f16_flush: 2346; GFX12: ; %bb.0: 2347; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2348; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2349; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2350; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2351; GFX12-NEXT: s_wait_kmcnt 0x0 2352; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] 2353; GFX12-NEXT: s_wait_loadcnt 0x0 2354; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 2355; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] 2356; GFX12-NEXT: s_endpgm 2357 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2358 %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id 2359 %v = load <2 x half>, ptr addrspace(1) %gep, align 4 2360 %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) 2361 %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i32 %id 2362 store <2 x half> %canonicalized, ptr addrspace(1) %gep2, align 2 2363 ret void 2364} 2365 2366define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 { 2367; GFX6-LABEL: test_canonicalize_value_f64_denorm: 2368; 
GFX6: ; %bb.0: 2369; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2370; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2371; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2372; GFX6-NEXT: v_mov_b32_e32 v1, s1 2373; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2374; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2375; GFX6-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2376; GFX6-NEXT: v_mov_b32_e32 v3, s3 2377; GFX6-NEXT: v_add_i32_e32 v2, vcc, s2, v2 2378; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2379; GFX6-NEXT: s_waitcnt vmcnt(0) 2380; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 2381; GFX6-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2382; GFX6-NEXT: s_endpgm 2383; 2384; GFX8-LABEL: test_canonicalize_value_f64_denorm: 2385; GFX8: ; %bb.0: 2386; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2387; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2388; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2389; GFX8-NEXT: v_mov_b32_e32 v1, s1 2390; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2391; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2392; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2393; GFX8-NEXT: v_mov_b32_e32 v3, s3 2394; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 2395; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2396; GFX8-NEXT: s_waitcnt vmcnt(0) 2397; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 2398; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2399; GFX8-NEXT: s_endpgm 2400; 2401; GFX9-LABEL: test_canonicalize_value_f64_denorm: 2402; GFX9: ; %bb.0: 2403; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2404; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2405; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2406; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] 2407; GFX9-NEXT: s_waitcnt vmcnt(0) 2408; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 2409; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2410; GFX9-NEXT: s_endpgm 2411; 2412; GFX11-LABEL: test_canonicalize_value_f64_denorm: 2413; GFX11: ; %bb.0: 2414; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2415; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2416; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 2417; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2418; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2419; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 2420; GFX11-NEXT: s_waitcnt vmcnt(0) 2421; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 2422; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] 2423; GFX11-NEXT: s_endpgm 2424; 2425; GFX12-LABEL: test_canonicalize_value_f64_denorm: 2426; GFX12: ; %bb.0: 2427; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2428; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2429; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2430; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2431; GFX12-NEXT: s_wait_kmcnt 0x0 2432; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] 2433; GFX12-NEXT: s_wait_loadcnt 0x0 2434; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] 2435; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] 2436; GFX12-NEXT: s_endpgm 2437 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2438 %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id 2439 %v = load double, ptr addrspace(1) %gep, align 8 2440 %canonicalized = tail call double @llvm.canonicalize.f64(double %v) 2441 %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id 2442 store double %canonicalized, ptr addrspace(1) %gep2, align 8 2443 ret void 2444} 2445

; Kernel: loads the float at %arg[workitem.id.x], canonicalizes it and stores
; the result to %out[workitem.id.x]. Uses attribute group #3
; ("denormal-fp-math"="ieee,ieee").
define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
; GFX6-LABEL: test_canonicalize_value_f32_denorm:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v3
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_canonicalize_value_f32_denorm:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_canonicalize_value_f32_denorm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_canonicalize_value_f32_denorm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_canonicalize_value_f32_denorm:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
  %v = load float, ptr addrspace(1) %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
  store float %canonicalized, ptr addrspace(1) %gep2, align 4
  ret void
}

; Kernel: loads the half at %arg[workitem.id.x], canonicalizes it and stores
; the result to %out[workitem.id.x]. Uses attribute group #3
; ("denormal-fp-math"="ieee,ieee").
; FIXME: Conversion to float should count as the canonicalize pre-gfx8
; NOTE(review): the checks recorded below for the hawaii run already contain
; only the v_cvt_f32_f16 / v_cvt_f16_f32 pair and no separate multiply or max;
; confirm whether the FIXME above is still current.
define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
; GFX6-LABEL: test_canonicalize_value_f16_denorm:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: flat_load_ushort v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: flat_store_short v[0:1], v3
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_canonicalize_value_f16_denorm:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_canonicalize_value_f16_denorm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_canonicalize_value_f16_denorm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_canonicalize_value_f16_denorm:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u16 v1, v0, s[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
; GFX12-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
  %v = load half, ptr addrspace(1) %gep, align 2
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
  store half %canonicalized, ptr addrspace(1) %gep2, align 2
  ret void
}

; Kernel: loads the <2 x half> at %arg[workitem.id.x], canonicalizes it and
; stores the result to %out[workitem.id.x]. Uses attribute group #3
; ("denormal-fp-math"="ieee,ieee"). The load is "align 4" while the store is
; "align 2"; the recorded output below was generated for exactly this input.
define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) %arg, ptr addrspace(1) %out) #3 {
; GFX6-LABEL: test_canonicalize_value_v2f16_denorm:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: test_canonicalize_value_v2f16_denorm:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
; GFX8-NEXT: v_or_b32_e32 v3, v0, v3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_canonicalize_value_v2f16_denorm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_canonicalize_value_v2f16_denorm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_canonicalize_value_v2f16_denorm:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %id
  %v = load <2 x half>, ptr addrspace(1) %gep, align 4
  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i32 %id
  store <2 x half> %canonicalized, ptr addrspace(1) %gep2, align 2
  ret void
}

; Kernel: loads the <2 x double> at %out[workitem.id.x], canonicalizes it and
; stores the result to the base pointer %out (not to the indexed element).
; Uses attribute group #1, which only sets "denormal-fp-math-f32".
define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) #1 {
; GFX6-LABEL: v_test_canonicalize_var_v2f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v5, s1
; GFX6-NEXT: v_mov_b32_e32 v4, s0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX6-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: v_test_canonicalize_var_v2f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_var_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_canonicalize_var_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_test_canonicalize_var_v2f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x double>, ptr addrspace(1) %out, i32 %tid
  %val = load <2 x double>, ptr addrspace(1) %gep
  %canonicalized = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %val)
  store <2 x double> %canonicalized, ptr addrspace(1) %out
  ret void
}

; Non-kernel function: canonicalizes a <2 x float> argument and returns it.
; Uses attribute group #1
; ("denormal-fp-math-f32"="preserve-sign,preserve-sign").
define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v2f32_flush:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v2f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v2f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v2f32_flush:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
  ret <2 x float> %canon
}

; Non-kernel function: canonicalizes a <3 x float> argument and returns it.
; Uses attribute group #1
; ("denormal-fp-math-f32"="preserve-sign,preserve-sign").
define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v3f32_flush:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v3f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v3f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v3f32_flush:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
  ret <3 x float> %canon
}

; Non-kernel function: canonicalizes a <4 x float> argument and returns it.
; Uses attribute group #1
; ("denormal-fp-math-f32"="preserve-sign,preserve-sign").
define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v4f32_flush:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX678-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v4f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v4f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v4f32_flush:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
  ret <4 x float> %canon
}

; Non-kernel function: canonicalizes an <8 x float> argument and returns it.
; Uses attribute group #1
; ("denormal-fp-math-f32"="preserve-sign,preserve-sign").
define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v8f32_flush:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX678-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX678-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX678-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX678-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX678-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX678-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX678-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v8f32_flush:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v8f32_flush:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v8f32_flush:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
  ret <8 x float> %canon
}

; Non-kernel function: canonicalizes a <2 x double> argument and returns it.
; Uses attribute group #1, which only sets "denormal-fp-math-f32".
define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v2f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v2f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
  ret <2 x double> %canon
}

; Non-kernel function: canonicalizes a <3 x double> argument and returns it.
; Uses attribute group #1, which only sets "denormal-fp-math-f32".
define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v3f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v3f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v3f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v3f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
  ret <3 x double> %canon
}

; Non-kernel function: canonicalizes a <4 x double> argument and returns it.
; Uses attribute group #1, which only sets "denormal-fp-math-f32".
define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
; GFX678-LABEL: v_test_canonicalize_v4f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX678-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_canonicalize_v4f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_v4f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_test_canonicalize_v4f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
  ret <4 x double> %canon
}

; Attribute groups referenced by the "#N" suffixes on the declarations and
; definitions in this file.
; NOTE(review): #2 and #4 are textually identical; both are kept because
; functions outside this part of the file may reference either one.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" }
attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" }
attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" }
attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" }